In [2]:
import numpy as np
import pandas as pd

In [86]:
class Node(object):
    """One node of the decision tree.

    Attributes:
        ids: row ids of the training samples that reached this node.
        entropy: entropy of the target over those samples.
        depth: distance from the root (the root has depth 0).
        split_attribute: feature chosen to divide this node into children,
            or None when the node has not been split.
        children: list of child nodes; empty for a leaf.
        val_chosen: feature values of the split, stored for prediction;
            ``val_chosen[k]`` corresponds to ``children[k]``.
        label: predicted label stored on the node, or None if not set.
    """

    def __init__(self,ids=None,children=None,entropy=0,depth=0):
        self.ids=ids
        self.entropy=entropy
        self.depth=depth
        self.split_attribute=None # chosen feature to divide into children
        # A fresh list per instance: a literal ``[]`` default would be one
        # list object shared by every node created without explicit children.
        self.children=[] if children is None else children
        self.val_chosen=None      # feature values of the split, kept for prediction
        self.label=None

    def set_val(self,split_attribute,val_chosen):
        """Record the attribute this node splits on and its branch values."""
        self.split_attribute=split_attribute
        self.val_chosen=val_chosen

    def set_label(self,label):
        """Store the label predicted at this node."""
        self.label=label

class decision_tree_id3(object):
    """ID3-style decision tree for categorical features in a pandas DataFrame.

    An internal node splits on a single attribute and gets one child per
    distinct value of that attribute among the node's training rows. Row ids
    kept in the nodes are positions (as used by ``iloc``), so the index labels
    of the training data are irrelevant.
    """

    def __init__(self,max_depth=10,min_sample_split=2,min_gain=1e-4):
        """Store the hyper-parameters; the tree itself is built by optimizer_step.

        Args:
            max_depth: nodes at this depth are never split.
            min_sample_split: an attribute is rejected when any of its
                branches would receive fewer rows than this.
            min_gain: minimum information gain required to accept a split.
        """
        self.root=None
        self.max_depth=max_depth
        self.min_sample_split=min_sample_split
        self.Ntrain=0
        self.min_gain=min_gain

    def optimizer_step(self,data,target):
        """Fit the tree.

        Args:
            data: DataFrame with one column per feature.
            target: Series of labels, aligned with ``data`` by position.

        Raises:
            ValueError: if ``data`` has no rows.
        """
        n_rows=len(data)   # len() counts rows even when a column contains NaN
        if n_rows==0:
            raise ValueError("cannot fit a decision tree on empty data")
        self.attributes=list(data)    # feature names
        self.Ntrain=n_rows
        self.data=data
        self.target=target
        self.labels=target.unique()
        ids=list(range(self.Ntrain))
        self.root=Node(ids=ids,children=[],entropy=self.entropy(ids),depth=0)
        queue=[self.root]
        while queue:
            node=queue.pop()
            # Every node gets a majority label, so prediction can stop at an
            # internal node when it meets a value that was not seen in training.
            self._set_label(node)
            # A node whose entropy is already below min_gain cannot yield a
            # split with gain >= min_gain, so it is a leaf without searching.
            if node.depth<self.max_depth and node.entropy>=self.min_gain:
                node.children=self._split(node)
                queue+=node.children

    def _split(self,node):
        """Find the attribute with the highest information gain for ``node``.

        Records the chosen attribute and its values on the node (both None
        when no attribute qualifies) and returns the list of child nodes,
        which is empty when the node stays a leaf.
        """
        ids=np.asarray(node.ids,dtype=int)   # positional row ids of this node
        best_gain=0
        best_splits=[]
        best_attribute=None
        val_chosen=None

        for i in range(len(self.attributes)):
            column=self.data.iloc[ids,i]    # this node's rows of feature i
            values=column.unique().tolist()
            if len(values)<=1: continue     # nothing to split on
            # Keep positions, not index labels: the ids are fed back into iloc.
            splits=[ids[(column==value).to_numpy()].tolist() for value in values]
            if min(map(len,splits))<self.min_sample_split: continue
            # Weighted average entropy of the branches.
            entropy_=0
            for split in splits:
                entropy_+=len(split)*self.entropy(split)/len(ids)
            gain=node.entropy-entropy_
            if gain<self.min_gain: continue
            if gain>best_gain:
                best_gain=gain
                best_attribute=self.attributes[i]
                best_splits=splits
                val_chosen=values
        node.set_val(best_attribute,val_chosen)
        # children=[] is passed explicitly so each child owns its own list.
        child_nodes=[Node(ids=split,children=[],entropy=self.entropy(split),depth=node.depth+1) for split in best_splits]
        return child_nodes

    def _set_label(self,node):
        """Label ``node`` with the most frequent target value among its rows."""
        target_ids=node.ids
        node.set_label(self.target.iloc[target_ids].mode().iloc[0])

    def entropy(self,ids):
        """Return the entropy (natural log) of the target over rows ``ids``; 0 for no rows."""
        if len(ids) == 0: return 0
        freq = np.array(self.target.iloc[ids].value_counts()) # occurrences of each label
        freq=freq[freq!=0]  # drop zero counts, log(0) is undefined
        prob=freq/float(np.sum(freq))
        return -np.sum(prob*np.log(prob))

    def predict(self,new_data):
        """Predict a label for every row of ``new_data``.

        Each row walks down the tree following the branch that matches its
        value of the node's split attribute. If the value was not seen at
        that node during training, the walk stops and the node's majority
        label is used.

        Returns:
            A list with one label per row, in row order.
        """
        npoints=len(new_data)
        labels=[None]*npoints
        for n in range(npoints):
            x=new_data.iloc[n,:]
            node=self.root
            while node.children:    # descend until a leaf is reached
                try:
                    branch=node.val_chosen.index(x[node.split_attribute])
                except ValueError:
                    break           # unseen value: fall back to this node's label
                node=node.children[branch]
            labels[n]=node.label
        return labels
def arrcuracy(model,X,y):
    """Return the fraction of rows of ``X`` for which ``model`` predicts ``y``.

    Args:
        model: object with a ``predict(X)`` method returning one label per row.
        X: features, passed to ``model.predict`` unchanged.
        y: true labels (Series, array or plain list), one per row of ``X``.

    Raises:
        ValueError: if the number of predictions differs from ``len(y)``.
    """
    # Convert both sides to arrays so the comparison is element-wise even
    # when y is a plain list (list == list would be a single boolean).
    y_true=np.asarray(y)
    y_pred=np.asarray(model.predict(X))
    if y_true.shape!=y_pred.shape:
        raise ValueError("predictions and targets must have the same length")
    return np.mean(y_true==y_pred)
        

In [103]:
# Load the dataset from a CSV file in the working directory.
df1=pd.read_csv('Breast_Cancer.csv')
# Every column except the last is used as a feature ...
X = df1.iloc[:, :-1]
# ... and the last column is the target.
y = df1.iloc[:, -1]
# NOTE(review): head() is not the last expression of the cell, so it
# presumably displays nothing here - confirm or move it to its own cell.
df1.head()
# Last expression of the cell; the recorded output is (4024, 16).
df1.shape

(4024, 16)

In [104]:

# Build an ID3 tree with depth limit 10 and at least 3 rows per branch.
model1=decision_tree_id3(max_depth=10,min_sample_split=3)
# Fit on the full X / y.
model1.optimizer_step(X,y)
# The score is computed on the same X / y used for fitting, so it is a
# training-set accuracy, not a held-out estimate.
print(arrcuracy(model1,X,y))

0.8655566600397614
