In [1]:
#Surpress the warnings
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

# Import Play Tennis Data 

In [2]:
 
import pandas as pd
df_tennis = pd.read_csv('C:/Users/Nikhi/Downloads/PlayTennis.csv')
#print("\n Given Play Tennis Data Set:\n\n", df_tennis)

In [3]:
df_tennis.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,0,1,2,3,4
PlayTennis,No,No,Yes,Yes,Yes
Outlook,Sunny,Sunny,Overcast,Rain,Rain
Temperature,Hot,Hot,Hot,Mild,Cool
Humidity,High,High,High,High,Normal
Wind,Weak,Strong,Weak,Weak,Weak


In [4]:
df_tennis.shape

(14, 6)

In [5]:
#Inspect Null values
df_tennis.isnull().sum()

Unnamed: 0     0
PlayTennis     0
Outlook        0
Temperature    0
Humidity       0
Wind           0
dtype: int64

# Entropy of the Training Data Set

In [6]:
#Function to calculate the entropy of probaility of observations
# -p*log2*p

def entropy(probs):  
    import math
    return sum( [-prob*math.log(prob, 2) for prob in probs] )

#Function to calulate the entropy of the given Data Sets/List with respect to target attributes
def entropy_of_list(a_list):  
    #print("A-list",a_list)
    from collections import Counter
    cnt = Counter(x for x in a_list)   # Counter calculates the propotion of class
   # print("\nClasses:",cnt)
    #print("No and Yes Classes:",a_list.name,cnt)
    num_instances = len(a_list)*1.0   # = 14
    print("\n Number of Instances of the Current Sub Class is {0}:".format(num_instances ))
    probs = [x / num_instances for x in cnt.values()]  # x means no of YES/NO
    print("\n Classes:",min(cnt),max(cnt))
    print(" \n Probabilities of Class {0} is {1}:".format(min(cnt),min(probs)))
    print(" \n Probabilities of Class {0} is {1}:".format(max(cnt),max(probs)))
    return entropy(probs) # Call Entropy :
    
# The initial entropy of the YES/NO attribute for our dataset.
print("\n  INPUT DATA SET FOR ENTROPY CALCULATION:\n", df_tennis['PlayTennis'])

total_entropy = entropy_of_list(df_tennis['PlayTennis'])

print("\n Total Entropy of PlayTennis Data Set:",total_entropy)


  INPUT DATA SET FOR ENTROPY CALCULATION:
 0      No
1      No
2     Yes
3     Yes
4     Yes
5      No
6     Yes
7      No
8     Yes
9     Yes
10    Yes
11    Yes
12    Yes
13     No
Name: PlayTennis, dtype: object

 Number of Instances of the Current Sub Class is 14.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.35714285714285715:
 
 Probabilities of Class Yes is 0.6428571428571429:

 Total Entropy of PlayTennis Data Set: 0.9402859586706309


# Information Gain of Attributes 

In [7]:
def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    print("Information Gain Calculation of ",split_attribute_name)
    
    # Split Data by Possible Vals of Attribute:
    df_split = df.groupby(split_attribute_name)
  
    nobs = len(df.index) * 1.0
   # print("NOBS",nobs)
    df_agg_ent = df_split.agg({target_attribute_name : [entropy_of_list, lambda x: len(x)/nobs] })[target_attribute_name]
    
    df_agg_ent.columns = ['Entropy', 'PropObservations']
   
    
    # Calculate Information Gain:
    new_entropy = sum( df_agg_ent['Entropy'] * df_agg_ent['PropObservations'] )
    old_entropy = entropy_of_list(df[target_attribute_name])
    return old_entropy - new_entropy


print('Info-gain for Outlook is :'+str( information_gain(df_tennis, 'Outlook', 'PlayTennis')),"\n")
print('\n Info-gain for Humidity is: ' + str( information_gain(df_tennis, 'Humidity', 'PlayTennis')),"\n")
print('\n Info-gain for Wind is:' + str( information_gain(df_tennis, 'Wind', 'PlayTennis')),"\n")
print('\n Info-gain for Temperature is:' + str( information_gain(df_tennis, 'Temperature','PlayTennis')),"\n")

Information Gain Calculation of  Outlook

 Number of Instances of the Current Sub Class is 4.0:

 Classes: Yes Yes
 
 Probabilities of Class Yes is 1.0:
 
 Probabilities of Class Yes is 1.0:

 Number of Instances of the Current Sub Class is 5.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.4:
 
 Probabilities of Class Yes is 0.6:

 Number of Instances of the Current Sub Class is 5.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.4:
 
 Probabilities of Class Yes is 0.6:

 Number of Instances of the Current Sub Class is 14.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.35714285714285715:
 
 Probabilities of Class Yes is 0.6428571428571429:
Info-gain for Outlook is :0.2467498197744391 

Information Gain Calculation of  Humidity

 Number of Instances of the Current Sub Class is 7.0:

 Classes: No Yes
 
 Probabilities of Class No is 0.42857142857142855:
 
 Probabilities of Class Yes is 0.5714285714285714:

 Number of Instances of the Current Sub Class is 7.0:

 Classes:

# ID3 Algorithm

In [8]:
def id3(df, target_attribute_name, attribute_names, default_class=None):
    
    ## Tally target attribute:
    from collections import Counter
    cnt = Counter(x for x in df[target_attribute_name])
    
    ## First check: Is this split of the dataset homogeneous?
    if len(cnt) == 1:
        return next(iter(cnt)) 
    
  
    elif df.empty or (not attribute_names):
        return default_class  # Return None for Empty Data Set
    
    
    else:
        
        default_class = max(cnt.keys()) #No of YES and NO Class
       
        gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names] #
        index_of_max = gainz.index(max(gainz)) # Index of Best Attribute
       
        best_attr = attribute_names[index_of_max]
        
        # Create an empty tree, to be populated in a moment
        tree = {best_attr:{}} # Iniiate the tree with best attribute as a node 
        remaining_attribute_names = [i for i in attribute_names if i != best_attr]
        
        # Split dataset
      
        for attr_val, data_subset in df.groupby(best_attr):
            subtree = id3(data_subset,
                        target_attribute_name,
                        remaining_attribute_names,
                        default_class)
            tree[best_attr][attr_val] = subtree
        return tree

# Predicting Attributes

In [9]:
# Get Predictor Names (all but 'class')
attribute_names = list(df_tennis.columns)
print("List of Attributes:", attribute_names) 
attribute_names.remove('PlayTennis') #Remove the class attribute 
print("Predicting Attributes:", attribute_names)

List of Attributes: ['Unnamed: 0', 'PlayTennis', 'Outlook', 'Temperature', 'Humidity', 'Wind']
Predicting Attributes: ['Unnamed: 0', 'Outlook', 'Temperature', 'Humidity', 'Wind']


# Tree Construction

In [10]:
# Run Algorithm:
from pprint import pprint
tree = id3(df_tennis,'PlayTennis',attribute_names)
print("\n\nThe Resultant Decision Tree is :\n")
#print(tree)
pprint(tree)
attribute = next(iter(tree))
print("Best Attribute :\n",attribute)
print("Tree Keys:\n",tree[attribute].keys())

Information Gain Calculation of  Unnamed: 0

 Number of Instances of the Current Sub Class is 1.0:

 Classes: No No
 
 Probabilities of Class No is 1.0:
 
 Probabilities of Class No is 1.0:

 Number of Instances of the Current Sub Class is 1.0:

 Classes: No No
 
 Probabilities of Class No is 1.0:
 
 Probabilities of Class No is 1.0:

 Number of Instances of the Current Sub Class is 1.0:

 Classes: Yes Yes
 
 Probabilities of Class Yes is 1.0:
 
 Probabilities of Class Yes is 1.0:

 Number of Instances of the Current Sub Class is 1.0:

 Classes: Yes Yes
 
 Probabilities of Class Yes is 1.0:
 
 Probabilities of Class Yes is 1.0:

 Number of Instances of the Current Sub Class is 1.0:

 Classes: Yes Yes
 
 Probabilities of Class Yes is 1.0:
 
 Probabilities of Class Yes is 1.0:

 Number of Instances of the Current Sub Class is 1.0:

 Classes: No No
 
 Probabilities of Class No is 1.0:
 
 Probabilities of Class No is 1.0:

 Number of Instances of the Current Sub Class is 1.0:

 Classes: Ye

# Classification Accuracy

In [11]:
def classify(instance, tree, default=None): 
    
    #print("Instance:",instance)
    attribute = next(iter(tree))        
    print("Key:",tree.keys())  
    print("Attribute:",attribute) 
   
    # print("Insance of Attribute :",instance[attribute],attribute)
    if instance[attribute] in tree[attribute].keys():  
        result = tree[attribute][instance[attribute]]
        print("Instance Attribute:",instance[attribute],"TreeKeys :",tree[attribute].keys())
        if isinstance(result, dict): # this is a tree, delve deeper
            return classify(instance, result)
        else:
            return result # this is a label
    else:
        return default

In [12]:
df_tennis['predicted'] = df_tennis.apply(classify, axis=1, args=(tree,'No') ) 
   

print(df_tennis['predicted'])

print('\n Accuracy is:\n' + str( sum(df_tennis['PlayTennis']==df_tennis['predicted'] ) / (1.0*len(df_tennis.index)) ))


df_tennis[['PlayTennis', 'predicted']]


Key: dict_keys(['Unnamed: 0'])
Attribute: Unnamed: 0
Instance Attribute: 0 TreeKeys : dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
Key: dict_keys(['Unnamed: 0'])
Attribute: Unnamed: 0
Instance Attribute: 1 TreeKeys : dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
Key: dict_keys(['Unnamed: 0'])
Attribute: Unnamed: 0
Instance Attribute: 2 TreeKeys : dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
Key: dict_keys(['Unnamed: 0'])
Attribute: Unnamed: 0
Instance Attribute: 3 TreeKeys : dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
Key: dict_keys(['Unnamed: 0'])
Attribute: Unnamed: 0
Instance Attribute: 4 TreeKeys : dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
Key: dict_keys(['Unnamed: 0'])
Attribute: Unnamed: 0
Instance Attribute: 5 TreeKeys : dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
Key: dict_keys(['Unnamed: 0'])
Attribute: Unnamed: 0
Instance Attribute: 6 TreeKeys : dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1

Unnamed: 0,PlayTennis,predicted
0,No,No
1,No,No
2,Yes,Yes
3,Yes,Yes
4,Yes,Yes
5,No,No
6,Yes,Yes
7,No,No
8,Yes,Yes
9,Yes,Yes


# Classification Accuracy: Training/Testing Set

In [13]:
training_data = df_tennis.iloc[1:-4] 
test_data  = df_tennis.iloc[-4:] 
train_tree = id3(training_data, 'PlayTennis', attribute_names)

test_data['predicted2'] = test_data.apply(                               
                                          classify, 
                                          axis=1, 
                                          args=(train_tree,'Yes') ) 


print ('\n\n Accuracy is : ' + str( sum(test_data['PlayTennis']==test_data['predicted2'] ) / (1.0*len(test_data.index)) ))

Information Gain Calculation of  Unnamed: 0

 Number of Instances of the Current Sub Class is 1.0:

 Classes: No No
 
 Probabilities of Class No is 1.0:
 
 Probabilities of Class No is 1.0:

 Number of Instances of the Current Sub Class is 1.0:

 Classes: Yes Yes
 
 Probabilities of Class Yes is 1.0:
 
 Probabilities of Class Yes is 1.0:

 Number of Instances of the Current Sub Class is 1.0:

 Classes: Yes Yes
 
 Probabilities of Class Yes is 1.0:
 
 Probabilities of Class Yes is 1.0:

 Number of Instances of the Current Sub Class is 1.0:

 Classes: Yes Yes
 
 Probabilities of Class Yes is 1.0:
 
 Probabilities of Class Yes is 1.0:

 Number of Instances of the Current Sub Class is 1.0:

 Classes: No No
 
 Probabilities of Class No is 1.0:
 
 Probabilities of Class No is 1.0:

 Number of Instances of the Current Sub Class is 1.0:

 Classes: Yes Yes
 
 Probabilities of Class Yes is 1.0:
 
 Probabilities of Class Yes is 1.0:

 Number of Instances of the Current Sub Class is 1.0:

 Classes