In [1]:
import pandas as pd
import math
import numpy as np

# TREE CLASS

In [2]:
class TreeNode:
    """One node of the decision tree built by ``createTree``.

    Attributes
    ----------
    data
        Value produced by the split-selection function for this node: the
        attribute (column) name at an internal node, the class label at a leaf.
    children
        List of the distinct values of the ``data`` attribute seen at this
        node, or ``None`` for a leaf.
    parent
        The node this one hangs from, or ``None`` when no parent was assigned.
    children_Node
        Child nodes, stored in the same order as ``children`` so that
        ``children_Node[i]`` is the subtree for ``children[i]``.
    """

    def __init__(self):
        # A fresh node starts empty; createTree fills it in.
        self.data = self.children = self.parent = None
        # Each instance gets its own list so siblings never share children.
        self.children_Node = []
        

# DATASET

## Train Dataset

In [3]:
# Load the training examples; the relative path is resolved against the
# kernel's current working directory.
dataset = pd.read_excel('dataset.xlsx')

In [4]:
# Remove the DAY column so it is never offered as a split attribute
# (presumably a per-row identifier such as D1, D2, ... - confirm against the sheet).
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps
# this cell idempotent: running it a second time no longer raises KeyError.
dataset = dataset.drop(columns=['DAY'], errors='ignore')
dataset.head()

Unnamed: 0,OUTLOOK,TEMPERATURE,HUMIDITY,WIND,PLAY BALL
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [5]:
# The class label is taken to be the right-most column of the training frame.
label = dataset.columns[-1]
# Distinct class labels, in order of first appearance.
classify = dataset[label].unique()
# Number of training rows per class label, as a plain dict {label: count}.
classify_num = dataset[label].value_counts().to_dict()

In [6]:
# Row-wise copy of the training frame as nested Python lists.
# NOTE(review): nothing in the visible part of the notebook reads `data` -
# confirm it is still needed before keeping this cell.
data = dataset.values.tolist()

## Prediction Dataset

In [7]:
# Load the rows to classify (path relative to the kernel's working directory)
# and display the first few as a sanity check.
df = pd.read_excel("prediction.xlsx")
df.head()

Unnamed: 0,DAY,OUTLOOK,TEMPERATURE,HUMIDITY,WIND,PLAY BALL
0,D1,Sunny,Hot,High,Weak,No
1,D2,Overcast,Mild,Normal,Weak,Yes


In [8]:
# Strip the prediction frame down to attribute columns only, keeping the
# known answers aside in `otp`.
# NOTE: this cell rebinds `df` from itself, so re-running it without first
# reloading prediction.xlsx raises KeyError (the columns are already gone).
df = df.drop(columns='DAY')
otp = df['PLAY BALL']
df = df.drop(columns='PLAY BALL')

# ALGORITHM

## TREE CREATION

In [9]:
def createTree(root, dataset, func):
    """Recursively grow a decision tree below ``root``.

    Parameters
    ----------
    root : TreeNode
        Node to fill in; its ``data``, ``children`` and ``children_Node``
        attributes are written in place.
    dataset : pandas.DataFrame
        Rows that reach this node. Must contain the module-level ``label``
        column.
    func : callable
        Split selector called as ``func(dataset)``. It must return either the
        name of the column to split on, or the first class label present in
        ``dataset`` to mark the node as a leaf.

    Returns
    -------
    ``root`` when the node was split, ``None`` when it ended up as a leaf.
    """
    root.data = func(dataset)

    # The selector signals "leaf" by returning exactly the first label value
    # of this subset; anything else is treated as an attribute to split on.
    first_label = list(dataset[label].unique())[0]
    if root.data != first_label:
        root.children = list(dataset[root.data].unique())

    if root.data is None or root.children is None:
        return None

    attribute = root.data
    for value in root.children:
        # Rows taking this attribute value, with the used attribute removed
        # so it cannot be chosen again further down the branch.
        subset = dataset[dataset[attribute] == value].drop(attribute, axis=1)
        child = TreeNode()
        child.parent = root
        createTree(child, subset, func)
        # Appended in the same order as root.children so indices line up.
        root.children_Node.append(child)
    return root

In [10]:
def LevelOrderTraversal(root):
    """Print the tree breadth-first, one output line group per depth level.

    For every node the function prints ``data`` followed by a newline and a
    space, then the node's ``children`` list followed by a space; a bare
    newline is printed after each level.

    Parameters
    ----------
    root : TreeNode or None
        Top of the tree to print. ``None`` prints nothing.

    Returns
    -------
    None
    """
    if root is None:
        return

    level = [root]
    while level:
        next_level = []
        for node in level:
            # Format via f-string so non-string payloads (e.g. numeric class
            # labels) print instead of raising TypeError on `data + '\n'`.
            print(f"{node.data}\n", end=' ')
            print(node.children, end=' ')
            next_level.extend(node.children_Node)
        print()
        # Swapping whole levels avoids the O(n) cost of list.pop(0).
        level = next_level

## ID3 ALGORITHM

In [11]:
def entropy_value(df):
    """Return the Shannon entropy, in bits, of the class label column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the module-level ``label`` column.

    Returns
    -------
    numpy float
        ``-sum(p_i * log2(p_i))`` over the classes present in ``df``.
    """
    counts = df[label].value_counts().tolist()
    total = np.sum(counts)

    # Accumulate n_i * log2(n_i / N) per class and divide by N once at the
    # end; value_counts only reports classes that occur, so log2 never sees 0.
    weighted_logs = [count * math.log2(count / total) for count in counts]

    return -1 * np.sum(weighted_logs) / total


In [12]:
# Entropy of the class label over the complete training set, i.e. the
# impurity before any split has been made.
entropy_dataset = entropy_value(dataset)

In [13]:
def entropy_attribute(dataset):
    """Choose the split for one tree node by information gain (ID3).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows that reach the node. Must contain the module-level ``label``
        column; every other column is a candidate attribute.

    Returns
    -------
    The name of the attribute with the highest information gain (first one
    wins on ties), or - when the node must become a leaf - the first class
    label present in ``dataset``.

    Notes
    -----
    A node is a leaf only when it is pure or when no attribute is left.
    Previously the leaf test compared the best gain with the entropy of the
    whole training set, which is true whenever the best split is *perfect*;
    impure nodes that one attribute separates cleanly were therefore turned
    into leaves carrying an arbitrary label. Gain is now also measured against
    this node's own entropy and row count rather than the training-set globals.
    """
    labels_here = list(dataset[label].unique())
    # createTree recognises a leaf by comparing the returned value with
    # exactly this "first label of the subset", so leaves must return it.
    first_label = labels_here[0]

    # Pure node: every row already has the same class.
    if len(labels_here) == 1:
        return first_label

    attr = [col for col in dataset.columns if col != label]
    # Attributes exhausted while the node is still mixed: nothing to split on.
    if not attr:
        return first_label

    total = len(dataset)
    node_entropy = entropy_value(dataset)

    gain_dict = {}
    for col in attr:
        # Expected entropy after splitting on `col`: each branch's entropy
        # weighted by the share of this node's rows that fall into it.
        remainder = 0
        for val in dataset[col].unique():
            subset = dataset[dataset[col] == val]
            remainder += (len(subset) / total) * entropy_value(subset)
        gain_dict[col] = node_entropy - remainder

    # max() returns the first key reaching the maximum, matching column order.
    return max(gain_dict, key=gain_dict.get)


In [14]:
def ID3_train(dataset):
    """Grow a decision tree using the entropy criterion.

    The finished tree is not returned; it is published through the
    module-level ``root`` variable, replacing any tree trained earlier.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Training rows, including the class label column.
    """
    global root
    # Rebind the global first so the new tree is grown from an empty node.
    root = TreeNode()
    createTree(root=root, dataset=dataset, func=entropy_attribute)

## GINI INDEX ALGORITHM

In [15]:
def gini_value(dataframe):
    """Return the Gini impurity of the class label column of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the module-level ``label`` column.

    Returns
    -------
    numpy float
        ``1 - sum(p_i ** 2)`` over the classes present in ``dataframe``.
    """
    counts = dataframe[label].value_counts().tolist()
    total = np.sum(counts)

    # Squared share of rows belonging to each class.
    squared_shares = [np.square(count / total) for count in counts]

    return 1 - np.sum(squared_shares)


In [16]:
# Gini impurity of the class label over the complete training set.
# NOTE(review): no visible cell reads `gini_dataset` - confirm it is needed.
gini_dataset = gini_value(dataset)

In [17]:
def gini_attribute(dataset):
    """Choose the split for one tree node by weighted Gini impurity.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows that reach the node. Must contain the module-level ``label``
        column; every other column is a candidate attribute.

    Returns
    -------
    The name of the attribute whose split has the lowest weighted Gini
    impurity (first one wins on ties), or - when the node must become a
    leaf - the first class label present in ``dataset``.

    Notes
    -----
    A node is a leaf only when it is pure or when no attribute is left.
    Previously a minimum weighted impurity of 0 was taken as the leaf signal,
    but that is true whenever the best split is *perfect*; impure nodes that
    one attribute separates cleanly were therefore turned into leaves carrying
    an arbitrary label. Branch weights now use this node's own row count
    rather than the size of the whole training set.
    """
    labels_here = list(dataset[label].unique())
    # createTree recognises a leaf by comparing the returned value with
    # exactly this "first label of the subset", so leaves must return it.
    first_label = labels_here[0]

    # Pure node: every row already has the same class.
    if len(labels_here) == 1:
        return first_label

    attr = [col for col in dataset.columns if col != label]
    # Attributes exhausted while the node is still mixed: nothing to split on.
    if not attr:
        return first_label

    total = len(dataset)

    impurity_by_attr = {}
    for col in attr:
        # Impurity of each branch weighted by the share of this node's rows
        # that fall into it.
        weighted = 0
        for val in dataset[col].unique():
            subset = dataset[dataset[col] == val]
            weighted += (len(subset) / total) * gini_value(subset)
        impurity_by_attr[col] = weighted

    # min() returns the first key reaching the minimum, matching column order.
    return min(impurity_by_attr, key=impurity_by_attr.get)

In [18]:
def GINI_train(dataset):
    """Grow a decision tree using the Gini-index criterion.

    The finished tree is not returned; it is published through the
    module-level ``root`` variable, replacing any tree trained earlier.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Training rows, including the class label column.
    """
    global root
    # Rebind the global first so the new tree is grown from an empty node.
    root = TreeNode()
    createTree(root=root, dataset=dataset, func=gini_attribute)

## PREDICTION ALGORITHM

In [19]:
def prediction(root, df):
    """Walk the tree from ``root`` and return the label reached for ``df``.

    Parameters
    ----------
    root : TreeNode
        Node to start from.
    df : pandas.DataFrame
        Attribute values of the example to classify. Only the first distinct
        value of each tested column is used, so a one-row frame is expected.

    Returns
    -------
    The ``data`` of the first node on the path whose ``data`` is not a column
    of the remaining frame, i.e. the leaf's class label.

    Raises
    ------
    ValueError
        If the example holds an attribute value the node has no branch for.
    """
    node, remaining = root, df
    # A node whose payload names a remaining column is a test node; anything
    # else is treated as a leaf.
    while node.data in remaining.columns:
        tested = node.data
        observed = remaining[tested].unique()[0]
        # children and children_Node are index-aligned: same position, same branch.
        branch = node.children.index(observed)
        node = node.children_Node[branch]
        remaining = remaining.drop(tested, axis=1)
    return node.data

In [20]:
def predict(df):
    """Classify every row of ``df`` with the tree held in the global ``root``.

    Parameters
    ----------
    df : pandas.DataFrame
        One example per row, attribute columns only.

    Returns
    -------
    list
        One predicted label per row, in row order.
    """
    # iterrows yields (index, Series); the Series is turned back into a
    # one-row frame because `prediction` looks attributes up by column.
    return [
        prediction(root, pd.DataFrame(record).T)
        for _, record in df.iterrows()
    ]

# MENU DRIVEN PROGRAM

In [21]:
# Interactive driver: train with the chosen criterion, print the predictions
# for the prediction frame, and repeat until the user declines to continue.
while True:
    raw_option = input("1. Entropy\n2. Gini Index")
    try:
        option = int(raw_option)
    except ValueError:
        # Non-numeric input used to abort the whole cell with a traceback.
        option = None

    trainer = {1: ID3_train, 2: GINI_train}.get(option)
    if trainer is None:
        # Unknown choices were silently ignored before; tell the user instead.
        print(f"Unrecognised option {raw_option!r}; please enter 1 or 2.")
    else:
        trainer(dataset)
        print(predict(df))

    cont = input("Do you want to Continue? (y/n) : ")
    # Accept 'n', 'N', 'no', ... so the loop cannot be left only by exact 'n'.
    if cont.strip().lower() in ('n', 'no'):
        break

['No', 'Yes']
['No', 'Yes']
