In [1]:
import csv
import math

In [2]:
def major_class(data,attributes,target):
    """Return the most frequent value in the target column of ``data``.

    Args:
        data: sequence of rows (indexable sequences).
        attributes: list of column names, positionally aligned with each row.
        target: name of the column whose values are counted.

    Returns:
        The value occurring most often in the target column. Ties go to the
        value seen first; an empty ``data`` yields the empty string.

    Raises:
        ValueError: if ``target`` is not in ``attributes``.
    """
    counts = {}
    column = attributes.index(target)
    for row in data:
        label = row[column]
        counts[label] = counts.get(label, 0) + 1
    # max() returns the first maximal key in insertion order, so ties resolve
    # to the earliest-seen label; default covers the empty-data case.
    return max(counts, key=counts.get, default="")

In [3]:
def entropy(attributes,data,targetAttr):
    """Compute the Shannon entropy (base 2) of the target column of ``data``.

    A row whose target cell equals ``targetAttr`` itself is treated as the
    header row and ignored. Probabilities are normalised by the number of
    rows actually counted, so a header row in ``data`` does not skew them.

    Args:
        attributes: list of column names, positionally aligned with each row.
        data: sequence of rows (indexable sequences).
        targetAttr: name of the column holding the class label.

    Returns:
        The entropy as a float; 0.0 when no rows are counted.

    Raises:
        ValueError: if ``targetAttr`` is not in ``attributes``.
    """
    column = attributes.index(targetAttr)
    counts = {}
    for entry in data:
        label = entry[column]
        if label == targetAttr:
            # Header row: its "label" is the column name, not a class value.
            continue
        counts[label] = counts.get(label, 0) + 1
    total = sum(counts.values())
    data_entropy = 0.0
    for count in counts.values():
        # total > 0 whenever this loop body runs, so the division is safe.
        prob = count / total
        data_entropy -= prob * math.log(prob, 2)
    return data_entropy

In [4]:
def info_gain(data,attributes,targetAttr,attr):
    """Compute the information gain of splitting ``data`` on ``attr``.

    The gain is the entropy of the target column over all data rows minus the
    weighted sum of the entropies of the subsets produced by each distinct
    value of ``attr``. A row whose ``attr`` cell equals the attribute name is
    treated as the header row and excluded from every computation.

    Args:
        data: sequence of rows (indexable sequences).
        attributes: list of column names, positionally aligned with each row.
        targetAttr: name of the column holding the class label.
        attr: name of the candidate split column.

    Returns:
        The information gain as a float.

    Raises:
        ValueError: if ``attr`` is not in ``attributes``.
    """
    column = attributes.index(attr)
    # Drop the header row by looking at the split column itself rather than
    # relying on a dataset-specific first-column name.
    rows = [entry for entry in data if entry[column] != attr]
    counts = {}
    for entry in rows:
        counts[entry[column]] = counts.get(entry[column], 0) + 1
    total = len(rows)
    subset_entropy = 0.0
    for val, count in counts.items():
        data_subset = [entry for entry in rows if entry[column] == val]
        subset_entropy += (count / total) * entropy(attributes,data_subset,targetAttr)
    return entropy(attributes,rows,targetAttr) - subset_entropy

In [5]:
def attr_choose(data,attributes,target):
    """Pick the attribute with the largest information gain.

    Every attribute other than ``target`` is scored with ``info_gain``; the
    first one whose gain is strictly greater than all earlier ones wins.
    If no candidate has a gain above zero, ``attributes[0]`` is returned.

    Args:
        data: sequence of rows (indexable sequences).
        attributes: list of column names, positionally aligned with each row.
        target: name of the column holding the class label.

    Returns:
        The name of the chosen attribute.
    """
    chosen = attributes[0]
    top_gain = 0
    candidates = (name for name in attributes if name != target)
    for candidate in candidates:
        gain = info_gain(data,attributes,target,candidate)
        if gain > top_gain:
            chosen, top_gain = candidate, gain
    return chosen

In [6]:
def get_values(data,attributes,attr):
    """List the distinct values found in column ``attr``, in first-seen order.

    A cell equal to the attribute name itself (the header row) is skipped.

    Args:
        data: sequence of rows (indexable sequences).
        attributes: list of column names, positionally aligned with each row.
        attr: name of the column to inspect.

    Returns:
        A list of the unique values of that column.

    Raises:
        ValueError: if ``attr`` is not in ``attributes``.
    """
    column = attributes.index(attr)
    distinct = []
    for row in data:
        cell = row[column]
        if cell != attr and cell not in distinct:
            distinct.append(cell)
    return distinct

In [7]:
def get_data(data,attributes,best,val):
    """Select the rows where column ``best`` equals ``val`` and drop that column.

    Args:
        data: sequence of rows (indexable sequences).
        attributes: list of column names, positionally aligned with each row.
        best: name of the column to filter on and remove.
        val: value the ``best`` column must have for a row to be kept.

    Returns:
        A list of lists: each kept row with the ``best`` column removed.

    Raises:
        ValueError: if ``best`` is not in ``attributes``.
    """
    column = attributes.index(best)
    return [
        [cell for position, cell in enumerate(row) if position != column]
        for row in data
        if row[column] == val
    ]

In [8]:
def build_tree(data,attributes,target):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        data: sequence of rows (indexable sequences).
        attributes: list of column names, positionally aligned with each row.
        target: name of the column holding the class label.

    Returns:
        Either a leaf (a single target value) or a dict of the form
        ``{attribute: {value: subtree, ...}}``. The majority class is returned
        when ``data`` is empty or only one attribute remains; the common label
        is returned when every row carries the same target value.
    """
    target_column = attributes.index(target)
    labels = [record[target_column] for record in data]
    fallback = major_class(data,attributes,target)

    # No rows, or nothing left to split on: fall back to the majority class.
    if not data or len(attributes) <= 1:
        return fallback
    # Every row has the same label: this node is a pure leaf.
    if labels.count(labels[0]) == len(labels):
        return labels[0]

    split_attr = attr_choose(data,attributes,target)
    branches = {}
    for value in get_values(data,attributes,split_attr):
        subset = get_data(data,attributes,split_attr,value)
        # get_data dropped the split column, so drop its name as well.
        remaining = attributes[:]
        remaining.remove(split_attr)
        branches[value] = build_tree(subset,remaining,target)
    return {split_attr: branches}

In [9]:
def test(attributes,instance,tree):
    attributes=next(iter(tree))
    i = attributes.index(attributes)
    if instance[i] in tree[attributes].keys():
        result = tree[attributes][instance[i]]
        if isinstance(result,dict):
            return test(attributes,instance,result)
        else:
            return result
    else:
        return 'NULL'

In [10]:
def execute_decision_tree():
    """Load 'PlayTennis.csv', build a decision tree and classify one instance.

    The first CSV row supplies the attribute names and the last attribute is
    used as the target. All rows read from the file, including that first
    row, are passed to ``build_tree``. The data set, the resulting tree and
    the prediction for a hard-coded instance are printed to stdout.
    """
    with open('PlayTennis.csv') as tsv:
        data = [tuple(line) for line in csv.reader(tsv)]

    attributes = list(data[0])
    target = attributes[-1]
    training_set = list(data)

    print("DATA SET IS:")
    print(training_set)
    print()

    tree = build_tree(training_set,attributes,target)

    print('Decision Tree is as below: \n')
    print(tree)

    instance = ['Sunny','Hot','High','Weak']
    print("*****************")
    print('Testing instance is:',instance)

    result = test(attributes,instance,tree)
    print('The Target value for the testing instance is:')
    print(result)

In [11]:
# Run the end-to-end demo. The function opens 'PlayTennis.csv' by relative
# path, so the file must be in the kernel's current working directory.
execute_decision_tree()

DATA SET IS:
[('Outlook', 'Temperature', 'Humidity', 'Windy', 'PlayTennis'), ('Sunny', 'Hot', 'High', 'FALSE', 'No'), ('Sunny', 'Hot', 'High', 'TRUE', 'No'), ('Overcast', 'Hot', 'High', 'FALSE', 'Yes'), ('Rainy', 'Mild', 'High', 'FALSE', 'Yes'), ('Rainy', 'Cool', 'High', 'FALSE', 'Yes'), ('Rainy', 'Cool', 'Normal', 'TRUE', 'No'), ('Overcast', 'Cool', 'Normal', 'TRUE', 'Yes'), ('Sunny', 'Mild', 'High', 'FALSE', 'No'), ('Sunny', 'Cool', 'Normal', 'FALSE', 'Yes'), ('Rainy', 'Mild', 'Normal', 'FALSE', 'Yes'), ('Sunny', 'Mild', 'Normal', 'TRUE', 'Yes'), ('Overcast', 'Mild', 'High', 'TRUE', 'Yes'), ('Overcast', 'Hot', 'Normal', 'FALSE', 'Yes'), ('Rainy', 'Mild', 'High', 'TRUE', 'No')]

Decision Tree is as below: 

{'Outlook': {'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}, 'Overcast': 'Yes', 'Rainy': {'Windy': {'TRUE': 'No', 'FALSE': 'Yes'}}}}
*****************
Testing instance is: ['Sunny', 'Hot', 'High', 'Weak']
The Target value for the testing instance is:
NULL
