In [1]:
import numpy as np
import pandas as pd

In [16]:
df = pd.read_csv("weather.csv").drop("id", axis = 1)
x = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
df

Unnamed: 0,outlook,temperature,humidity,wind,play
0,sunny,hot,high,weak,no
1,sunny,hot,high,strong,no
2,overcast,hot,high,weak,yes
3,rainy,mild,high,weak,yes
4,rainy,cool,normal,weak,yes
5,rainy,cool,normal,strong,no
6,overcast,cool,normal,strong,yes
7,sunny,mild,high,weak,no
8,sunny,cool,normal,weak,yes
9,rainy,mild,normal,weak,yes


In [3]:
attributes_and_values = {}
for column in df.columns.values[:-1]:
    attributes_and_values[column] = list(set(df[column].values))
all_class = ["yes", "no"]
attributes_and_values

{'outlook': ['sunny', 'rainy', 'overcast'],
 'temperature': ['hot', 'mild', 'cool'],
 'humidity': ['normal', 'high'],
 'wind': ['strong', 'weak']}

In [4]:
def entropy(ids):
    entropy = 0
    all_target = [y[id] for id in ids]
    for item in all_class:
        item_count = all_target.count(item)
        if item_count != 0:
            prob = item_count *1./len(all_target)
            entropy -= prob * np.log(prob)
    return entropy

def attribute_entropy(ids, attribute):
    attribute_entropy = 0
    values_and_ids = []
    for id in ids:
        values = list(df.iloc[id: id + 1][attribute].values)
        values_and_ids.append((id, values[0]))
    for value in attributes_and_values[attribute]:
        all_ids = []
        for id in range(len(values_and_ids)):
            if values_and_ids[id][1] == value:
                all_ids.append(values_and_ids[id][0])
        prob = len(all_ids) * 1./len(ids)
        attribute_entropy += prob * entropy(all_ids)
    return attribute_entropy

def most_suitable_attribute(ids):
    IGs = [] # list of information gain of all attributes 
    information_gain, attribute_selected = -1000, ""
    for attribute in attributes_and_values:
        new_information_gain = entropy(ids) - attribute_entropy(ids, attribute)
        if new_information_gain > information_gain:
            information_gain = new_information_gain
            attribute_selected = attribute
    return attribute_selected

def build_tree(ids):
    if entropy(ids) == 0:
        return y[ids[0]]
    else:
        node_attribute = most_suitable_attribute(ids)
        tree = {}
        tree[node_attribute] = {}
        for value in attributes_and_values[node_attribute]:
            all_ids = []
            for id in ids:
                if df.iloc[id: id + 1][node_attribute].values[0] == value:
                    all_ids.append(id)
            if len(all_ids):
                if entropy(all_ids) == 0:
                    tree[node_attribute][value] = y[all_ids[0]]
                else:
                    tree[node_attribute][value] = build_tree(all_ids)
        return tree

In [6]:
tree = build_tree(df.index)
tree

{'outlook': {'sunny': {'humidity': {'normal': 'yes', 'high': 'no'}},
  'rainy': {'wind': {'strong': 'no', 'weak': 'yes'}},
  'overcast': 'yes'}}

In [18]:
def predict(instance, tree):
    for node in tree.keys():
        value = instance[node]
        new_tree = tree[node][value]
        if type(new_tree) is dict:
            prediction = predict(instance, new_tree)
        else:
            prediction = new_tree
            break
    return prediction

y_pred = []
for id in df.index:
    pred = predict(df.iloc[id, :], tree)
    y_pred.append(pred)
y_pred

['no',
 'no',
 'yes',
 'yes',
 'yes',
 'no',
 'yes',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'no']