In [2]:
import pandas as pd
import numpy as np
import math

In [3]:
# Raw records: one entry per person, four categorical attributes plus the 'Badge' label.
data = {
    'Name': [
        'Pieter Bartlett', 'George Berg', 'Hiroshi Motoda',
        'Filippo Neri', 'Robert Roos', 'Satinder Singh',
        'Maja Mataric', 'Arun Sharma', 'Michael Meystel',
    ],
    'lenFirst': ['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes'],
    'lenLast': ['Yes', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes'],
    'sameFirst': ['No', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes'],
    'vowel': ['e', 'e', 'o', 'i', 'o', 'i', 'a', 'a', 'e'],
    'Badge': ['+', '-', '+', '-', '+', '+', '-', '-', '+'],
}

# Build the frame and index it by person name in a single expression.
df = pd.DataFrame(data).set_index('Name')

In [4]:
def calculate_entropy(predicted_column):
    """Return the base-2 Shannon entropy of the values in a label column.

    Parameters
    ----------
    predicted_column : pandas.Series
        Column of class labels.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed classes, where ``p`` is the
        class count divided by ``len(predicted_column)``. An empty column (or
        one with no countable values) has entropy ``0.0``.
    """
    total = len(predicted_column)
    counts = predicted_column.value_counts().to_list()
    # Nothing to count: define the entropy as 0 instead of leaving the result
    # unbound (the previous code raised UnboundLocalError here).
    if total == 0 or not counts:
        return 0.0
    # Categorical columns report unused categories with a count of 0; skip
    # them, since log2(0) is undefined and 0 * log2(0) is taken as 0.
    return -sum(
        (count / total) * math.log2(count / total)
        for count in counts
        if count > 0
    )


In [5]:
def calculate_info_gain(df, info_gain_column_name, predicted_column_name):
    """Information gain of the label column from splitting on one feature column.

    Computes the entropy of ``df[predicted_column_name]`` and subtracts the
    size-weighted entropies of the label within each distinct value of
    ``df[info_gain_column_name]``.
    """
    entropy_before = calculate_entropy(df[predicted_column_name])
    n_rows = len(df)
    feature_values = df[info_gain_column_name]

    entropy_after = 0
    for level in feature_values.unique():
        # Rows where the feature takes this value.
        partition = df[feature_values == level]
        partition_entropy = calculate_entropy(partition[predicted_column_name])
        share = len(partition) / n_rows
        entropy_after = entropy_after + partition_entropy * share

    return entropy_before - entropy_after

In [6]:
def ID3(df, original_df, features, predicted_column, parent_node=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows that reach the current node.
    original_df : pandas.DataFrame
        Rows of the parent node; their majority label is returned when ``df``
        is empty.
    features : sequence of column labels (e.g. ``pandas.Index`` or ``list``)
        Candidate split columns still available at this node.
    predicted_column : label
        Name of the target column.
    parent_node : label, optional
        Majority label of the parent node. Used as the last-resort answer when
        ``df`` is empty and ``original_df`` has no countable labels either.

    Returns
    -------
    A single label for a leaf, otherwise ``{feature: {value: subtree_or_label}}``.
    """
    # Check emptiness first. An empty frame also has "<= 1" distinct labels, so
    # testing purity first would index into an empty array.
    if len(df) == 0:
        parent_counts = original_df[predicted_column].value_counts()
        if len(parent_counts) > 0:
            return parent_counts.idxmax()
        return parent_node

    labels = df[predicted_column]
    distinct_labels = labels.unique()
    if len(distinct_labels) <= 1:
        return distinct_labels[0]

    # idxmax() reads the label straight from value_counts(). Indexing unique()
    # with value_counts().argmax() mixes two different orderings (appearance
    # order vs. count order) and can pick the minority class.
    majority_label = labels.value_counts().idxmax()

    # No feature left to split on: fall back to the majority label.
    if len(features) == 0:
        return majority_label

    gains = [calculate_info_gain(df, feature, predicted_column) for feature in features]
    best_feature = features[int(np.argmax(gains))]

    # Works for both pandas.Index and plain lists.
    remaining_features = [feature for feature in features if feature != best_feature]

    tree = {best_feature: {}}
    for value in df[best_feature].unique():
        sub_df = df[df[best_feature] == value]
        tree[best_feature][value] = ID3(
            sub_df, df, remaining_features, predicted_column, majority_label
        )

    return tree



In [10]:
# Show the candidate feature columns: every column of df except the 'Badge' label.
df.columns.drop('Badge')

Index(['lenFirst', 'lenLast', 'sameFirst', 'vowel'], dtype='object')

In [7]:
def predict(query, tree, default=1):
    """Classify one record by walking a nested-dict decision tree.

    Parameters
    ----------
    query : dict
        Maps feature name to the record's value for that feature.
    tree : dict
        Tree of the form ``{feature: {value: subtree_or_label}}``.
    default : object, optional
        Returned when the record cannot be classified: the tree has no branch
        for the record's value, or none of the record's features appears at
        the current node.

    Returns
    -------
    The leaf label reached, or ``default``.
    """
    for key in query:
        if key not in tree:
            continue
        try:
            result = tree[key][query[key]]
        except (KeyError, TypeError):
            # Value not seen while building the tree (or not usable as a key).
            return default
        if isinstance(result, dict):
            # Pass the caller's default down; otherwise a miss in a subtree
            # would return the signature default instead.
            return predict(query, result, default)
        return result
    # No feature of the record matches this node.
    return default

In [8]:
# Fit the tree on all rows of df: every column except 'Badge' is a candidate
# feature, and 'Badge' is the target. df is also passed as the parent frame.
tree = ID3(df, df, df.columns.drop('Badge'), 'Badge')

# Print the resulting nested-dict tree.
print(tree)

{'vowel': {'e': {'lenLast': {'Yes': '+', 'No': '-'}}, 'o': '+', 'i': {'sameFirst': {'No': '-', 'Yes': '+'}}, 'a': '-'}}


In [9]:
def test(data, tree):
    """Predict a label for every row of ``data`` using ``tree``.

    Parameters
    ----------
    data : pandas.DataFrame
        Records to classify. The last column is dropped before building the
        queries, so it is expected to hold the label.
    tree : dict
        Nested-dict decision tree accepted by ``predict``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``data``, indexed 0..n-1, with a single
        ``'predicted'`` column. Rows the tree cannot classify get ``1.0``.
    """
    queries = data.iloc[:, :-1].to_dict(orient='records')
    # Iterate over the queries built from `data`, not over the length of the
    # notebook-level `df`, so the function works for any frame passed in.
    predictions = [predict(query, tree, 1.0) for query in queries]
    # Build the frame once rather than growing it row by row; object dtype
    # matches what row-wise assignment into an empty frame produced.
    return pd.DataFrame({'predicted': pd.Series(predictions, dtype=object)})

# Predict a label for every row of the training frame with the fitted tree
# and print the resulting one-column frame.
stuff = test(df, tree)
print(stuff)

  predicted
0         +
1         -
2         +
3         -
4         +
5         +
6         -
7         -
8         +


In [20]:
def calculate_accuracy(predictions, df_labels):
    """Fraction of predictions equal to the true labels, rounded to 3 decimals.

    The two inputs are compared element-wise with ``==``; the number of matches
    is divided by ``len(df_labels)``.
    """
    is_match = predictions == df_labels
    n_correct = np.sum(is_match)
    n_total = len(df_labels)
    accuracy = n_correct / n_total
    return round(accuracy, 3)


In [21]:
# Training-set accuracy. Both series get a fresh 0..n-1 index so they are
# compared by position: the predictions are indexed by row number, while df
# is indexed by 'Name'.
correct = calculate_accuracy(stuff['predicted'].reset_index(drop=True), df['Badge'].reset_index(drop=True))
print(correct)

1.0


In [6]:
# Entropy of the 'Badge' label column.
output = calculate_entropy(df['Badge'])
display(output)

# Information gain from splitting on 'lenFirst'.
# NOTE(review): this rebinds `stuff`, which the earlier cells use for the
# predictions DataFrame returned by test(). Re-running the accuracy cell after
# this one would then find a number instead of that frame. Consider a distinct name.
# NOTE(review): the saved output shows the same value for both results below,
# which looks stale. Re-run on a fresh kernel to confirm.
stuff = calculate_info_gain(df, 'lenFirst', 'Badge')




print(stuff)



0.7642045065086203

0.7642045065086203
