In [1]:
import pandas as pd 
import numpy as np

In [2]:
def find_entropy(df):
    """Return the Shannon entropy (in bits) of the target column of ``df``.

    The target is taken to be the last column of the frame. Each distinct
    label contributes ``-p * log2(p)``, where ``p`` is the fraction of rows
    carrying that label. A frame with no rows yields ``0``.
    """
    label_column = df[df.keys()[-1]]
    n_rows = len(df)
    total = 0
    for label in label_column.unique():
        # Fraction of rows whose target equals this label.
        share = int((label_column == label).sum()) / n_rows
        total -= share * np.log2(share)
    return total

In [3]:
# Load the training data. The helper functions in this notebook read the
# target from the last column (df.keys()[-1]).
df = pd.read_csv('./weather.csv')
df.head()

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play
0,rainy,hot,high,0,0
1,rainy,hot,high,1,0
2,overcast,hot,high,0,1
3,sunny,mild,high,0,1
4,sunny,cool,normal,0,1


In [4]:
def find_average_info_entropy(df, attribute):
    """Return the weighted average target entropy after splitting on ``attribute``.

    For every distinct value of ``attribute`` the rows carrying that value
    form a subset; the entropy (in bits) of the target column (the last
    column of ``df``) inside that subset is weighted by the subset's share
    of all rows, and the weighted entropies are summed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose last column is the target.
    attribute : str
        Name of the column to split on.

    Returns
    -------
    float
        The conditional entropy of the target given ``attribute``. It is
        exactly ``0.0`` when every subset is pure and never negative.
    """
    target = df.keys()[-1]
    n_rows = len(df)
    average_info_entropy = 0.0
    for attr_value in df[attribute].unique():
        # One boolean mask selects the subset's target values directly,
        # instead of chaining boolean indexers that need index re-alignment.
        subset_target = df.loc[df[attribute] == attr_value, target]
        subset_size = len(subset_target)
        if subset_size == 0:
            # A NaN attribute value never compares equal to itself, so it
            # selects no rows; skip it rather than divide by zero.
            continue
        counts = subset_target.value_counts().to_numpy()
        # Classes absent from the subset contribute 0 to the entropy; dropping
        # them avoids log2(0) without the biasing epsilon inside the log.
        probs = counts[counts > 0] / subset_size
        entropy_subsample = -np.sum(probs * np.log2(probs))
        average_info_entropy += (subset_size / n_rows) * entropy_subsample
    return average_info_entropy

In [5]:
# Entropy of the target column over the full training frame.
find_entropy(df)

0.9402859586706311

In [6]:
# Weighted target entropy that remains after splitting the training frame on 'Outlook'.
find_average_info_entropy(df, 'Outlook')

0.6935358915770655

In [7]:
def find_winner(df):
    """Return the name of the attribute with the highest information gain.

    Every column except the last (the target) is a candidate. The gain of a
    candidate is the entropy of the target over ``df`` minus the average
    information entropy left after splitting on that candidate. Ties go to
    the first candidate in column order, because ``np.argmax`` returns the
    first maximum.
    """
    attributes = df.keys()[:-1]
    # The entropy of the unsplit frame is the same for every candidate, so
    # compute it once instead of once per attribute.
    base_entropy = find_entropy(df)
    gains = [
        base_entropy - find_average_info_entropy(df, attribute)
        for attribute in attributes
    ]
    return attributes[np.argmax(gains)]

In [8]:
# Attribute with the highest information gain on the full training frame.
find_winner(df)

'Outlook'

In [9]:
def training(df, tree=None):
    """Recursively build a decision tree from ``df`` as nested dicts.

    The tree has the shape ``{attribute: {value: subtree_or_label}}``. At each
    level the attribute chosen by ``find_winner`` is used to split the rows;
    a branch whose rows all share one target label becomes a leaf holding
    that label, otherwise the branch is built by recursing on its rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; the last column is the target.
    tree : dict, optional
        Existing dict to add the split to. A new dict is created when omitted.

    Returns
    -------
    dict
        The tree (the same object as ``tree`` when one was supplied).
    """
    split_attribute = find_winner(df)
    target = df.keys()[-1]
    if tree is None:
        tree = {}
    # setdefault also covers a caller-supplied tree that has no entry for
    # this attribute yet, which previously raised KeyError.
    branches = tree.setdefault(split_attribute, {})
    for value in df[split_attribute].unique():
        # Keep only the rows of this branch, drop the column just split on
        # (it is constant inside the branch, and removing it guarantees the
        # recursion terminates), and reset the index.
        sub_df = (
            df[df[split_attribute] == value]
            .drop(columns=split_attribute)
            .reset_index(drop=True)
        )
        if sub_df.empty:
            # A NaN attribute value matches no rows; there is nothing to learn.
            continue
        unique_values = sub_df[target].unique()
        if len(unique_values) == 1:
            branches[value] = unique_values[0]
        elif sub_df.shape[1] == 1:
            # Only the target column is left but the labels still disagree:
            # fall back to the most frequent label (smallest one on ties).
            branches[value] = sub_df[target].mode().iloc[0]
        else:
            branches[value] = training(sub_df)
    return tree

In [10]:
# Build the decision tree from the training frame and display the nested dict.
tree = training(df)
tree

{'Outlook': {'rainy': {'Humidity': {'high': 0, 'normal': 1}},
  'overcast': 1,
  'sunny': {'Windy': {0: 1, 1: 0}}}}

In [11]:
def predict(inst, tree):
    """Return the label the decision tree assigns to ``inst``.

    Parameters
    ----------
    inst : mapping-like
        Anything indexable by attribute name (for example a pandas Series
        holding one row, or a plain dict).
    tree : dict
        Nested dict of the form ``{attribute: {value: subtree_or_label}}``.

    Returns
    -------
    The leaf label reached by following ``inst``'s attribute values.

    Raises
    ------
    KeyError
        If ``inst`` lacks an attribute the tree tests, or carries a value
        the tree has no branch for.
    ValueError
        If a node of the tree does not hold exactly one attribute.
    """
    node = tree
    # Walk down one level per iteration until a non-dict leaf is reached.
    while isinstance(node, dict):
        if len(node) != 1:
            raise ValueError(
                f"decision node must test exactly one attribute, got {len(node)}"
            )
        attribute, branches = next(iter(node.items()))
        node = branches[inst[attribute]]
    return node

In [12]:
# Load the test file and collect one tree prediction per row.
df1 = pd.read_csv('./weather_test.csv')
Y_label = []
for i in range(len(df1)):
    inst = df1.iloc[i,:]  # i-th row as a Series, so predict can look values up by column name
    prediction = predict(inst, tree)
    Y_label.append(prediction)

In [13]:
# Predicted labels, in the same order as the rows of df1.
Y_label

[0, 1]

In [14]:
# Score the predictions against the last column of the test frame, which is
# passed as the true labels.
from sklearn import metrics
print(metrics.classification_report(df1.iloc[:,-1],Y_label))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



In [15]:
def visualizer(root, indent=0):
    """Print a nested-dict tree as an indented outline.

    Each dict key is printed as ``key:`` at the current indentation and its
    value is rendered two spaces deeper. Anything that is not a plain dict
    is treated as a leaf and printed with ``repr``.
    """
    padding = " " * indent
    if type(root) != dict:
        # Leaf: show the stored value itself.
        print(padding + repr(root))
        return
    for key in root:
        print(f"{padding}{key}:")
        visualizer(root[key], indent + 2)

In [16]:
# Print the trained tree as an indented outline.
visualizer(tree)

Outlook:
  rainy:
    Humidity:
      high:
        0
      normal:
        1
  overcast:
    1
  sunny:
    Windy:
      0:
        1
      1:
        0
