# DECISION TREE IMPLEMENTATION

In [5]:
!pip install pandas numpy matplotlib seaborn scikit-learn

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



In [6]:
import os

# Get the current working directory
cwd = os.getcwd()
print(f"Current Working Directory: {cwd}")

# Location of the golf dataset CSV. The default is the path this notebook was
# originally run with; set the GOLF_DATASET_PATH environment variable to load
# the file from somewhere else without editing the notebook.
DATA_PATH = os.environ.get('GOLF_DATASET_PATH', 'C:/Users/keerthana.r/Downloads/golf-dataset.csv')
df = pd.read_csv(DATA_PATH)

# Display the loaded dataset (the bare expression renders the whole frame)
df

Current Working Directory: C:\Users\keerthana.r


Unnamed: 0,Outlook,Temp,Humidity,Windy,Play Golf
0,Rainy,Hot,High,False,No
1,Rainy,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Sunny,Mild,High,False,Yes
4,Sunny,Cool,Normal,False,Yes
5,Sunny,Cool,Normal,True,No
6,Overcast,Cool,Normal,True,Yes
7,Rainy,Mild,High,False,No
8,Rainy,Cool,Normal,False,Yes
9,Sunny,Mild,Normal,False,Yes


In [7]:
# Function to calculate entropy

def entropy(var):
    """Return the Shannon entropy, in bits, of the values held in ``var``.

    Parameters
    ----------
    var : array-like with a ``shape`` attribute
        The observations; ``var.shape[0]`` is used as the total count and
        ``np.unique`` supplies the frequency of each distinct value.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct values, where ``p`` is each
        value's share of the total.
    """
    total = var.shape[0]

    # Only the frequencies matter; the distinct values themselves are unused.
    _, freqs = np.unique(var, return_counts=True)

    # Accumulate left-to-right from 0.0, one term per distinct value.
    weighted_logs = ((c / total) * np.log2(c / total) for c in freqs)
    return -sum(weighted_logs, 0.0)

In [8]:
# Function to Split the data

def splitData(data,feature):
    """Partition ``data`` by the distinct values of the column ``feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        The rows to partition.
    feature : str
        Name of the column whose values define the partitions.

    Returns
    -------
    dict
        Maps every value reported by ``data[feature].value_counts()`` (most
        frequent first) to a dict with two entries:

        * ``'data'`` - the rows of ``data`` holding that value, in their
          original order and re-indexed from 0;
        * ``'len'``  - the number of such rows.

    Notes
    -----
    Rows whose ``feature`` value is missing (NaN) belong to no partition,
    because ``value_counts`` ignores them. Values that ``value_counts``
    reports with a count of 0 (unused categories of a categorical column)
    get an empty frame and ``'len'`` 0.
    """
    column = data[feature]
    counts = column.value_counts()

    nodeVal = {}
    for val, count in counts.items():
        # One boolean mask per distinct value; this replaces appending rows
        # one at a time with pd.concat, which re-copied the growing frame on
        # every row and degraded the column dtypes via the empty seed frame.
        subset = data[column == val].reset_index(drop=True)
        nodeVal[val] = {'data': subset, 'len': int(count)}
    return nodeVal
    
# Example run: partition the full dataset by the values of the "Outlook" column
dataSplitted = splitData(df,"Outlook")

In [11]:
# Information Gain calculation function 

def information_gain(data,feature,target='Play Golf'):
    """Return the information gain obtained by splitting ``data`` on ``feature``.

    The gain is the entropy of the ``target`` column before the split minus
    the weighted average entropy of ``target`` inside each partition produced
    by ``splitData``; each partition is weighted by its share of the rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate; must contain ``feature`` and ``target`` columns.
    feature : str
        Column to split on.
    target : str, optional
        Column holding the class label (defaults to ``'Play Golf'``).

    Returns
    -------
    float
        Parent entropy minus weighted child entropy.
    """
    examples = data.shape[0]

    DATA = splitData(data,feature)

    # Weighted average of the entropy found in every partition.
    ent_of_children = 0.0
    for child in DATA.values():
        weight = child['len'] / examples
        ent_of_children += weight * entropy(child['data'][target])

    # The child entropy must be subtracted from the parent entropy. Returning
    # the parent entropy alone gives every feature the same score, so the
    # caller's argmax would always select the first feature.
    info_gain = entropy(data[target]) - ent_of_children
    return info_gain
        

In [20]:

class DecisionTree:
    """One node of a multi-way decision tree over categorical features.

    Every node records the feature it splits on (``fkey``), the majority
    ``'Play Golf'`` label of the rows it was trained on (``target``) and one
    child node per value of the split feature (``children``).
    """

    #constructor
    def __init__(self, depth=0, max_depth=5):
        """Create an untrained node.

        Parameters
        ----------
        depth : int
            Depth of this node; the root is 0.
        max_depth : int
            Nodes at this depth (or deeper) are never split further.
        """
        #creation of node
        self.children = {}          # split-feature value -> child DecisionTree
        self.fkey = None            # name of the feature this node splits on
        self.depth = depth
        self.max_depth = max_depth
        self.target = None          # majority 'Play Golf' label at this node

    #training
    def train(self, data):
        """Fit this node on ``data`` and recursively build its children.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain the columns 'Outlook', 'Temp', 'Humidity', 'Windy'
            and the label column 'Play Golf'.

        Returns
        -------
        None
            The node is updated in place (``fkey``, ``target``, ``children``).
        """
        features = ['Outlook','Temp','Humidity','Windy']

        # Nothing can be learned from an empty frame, and the majority-label
        # lookup below would raise on it (np.argmax of an empty list).
        if data.shape[0] == 0:
            return

        # finding the best feature: the one with the highest information gain
        info_gains = [information_gain(data,f) for f in features]
        self.fkey = features[np.argmax(info_gains)]

        #splitting the data
        DATA = splitData(data,self.fkey)

        # Majority label of the rows reaching this node (e.g. 9 Yes vs 5 No
        # gives a target of Yes).
        label_counts = data["Play Golf"].value_counts()
        labels = list(label_counts.index)
        freq = list(label_counts.values)
        self.target = labels[np.argmax(freq)]

        ############# stopping conditions ########################
        # 1. the node is pure: every row carries the same label
        if data["Play Golf"].nunique() <= 1:
            return

        # 2. the chosen split does not separate the rows into 2+ groups
        have_data = sum(1 for key in DATA if DATA[key]['len'] > 0)
        if have_data < 2:
            return

        # 3. the maximum depth is reached (>= so that no node is ever
        #    created deeper than max_depth)
        if self.depth >= self.max_depth:
            return

        print("\t"*self.depth + "Making Tree with - ", self.fkey)

        ############### recursively train child nodes ##################
        for key in DATA:
            # Hand max_depth down; otherwise every child silently falls back
            # to the constructor default.
            child = DecisionTree(depth = self.depth + 1, max_depth = self.max_depth)
            self.children[key] = child
            if DATA[key]['len'] > 0:
                child.train(DATA[key]['data'])
            else:
                # No rows reached this value: fall back to the parent's
                # majority label instead of training on an empty frame.
                child.target = self.target

        return

In [21]:
# Build the root node with the constructor defaults (depth=0, max_depth=5)
# and fit the tree on the whole dataset.
model = DecisionTree()
model.train(df)