# Decision Tree

- Supervised Learning Algorithm
- Graphical Structure
- Predictive Model - Recursive and Greedy
- Easy to Interpret



In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Load the Haberman CSV; header=None makes pandas treat the first line as
# data rather than column names.
df = pd.read_csv("/content/haberman.csv", header=None)

In [None]:
# Name the four columns; 'output' is the label column that info_gain() and
# DecisionTree read, the other three are the candidate split features.
df.columns = ['Age','OP_year','Nodes','output']

In [None]:
# Print row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Age      306 non-null    int64
 1   OP_year  306 non-null    int64
 2   Nodes    306 non-null    int64
 3   output   306 non-null    int64
dtypes: int64(4)
memory usage: 9.7 KB


# Entropy

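- Measure of randomness (impurity) in a set of labels
- 0 -> all labels are the same (all True or all False)
- 1 -> the two classes are equally frequent (equal True/False)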

In [None]:
def entropy(col):
  """Return the base-2 Shannon entropy of the values in ``col``.

  Args:
    col: Array-like with a ``shape`` attribute (e.g. a NumPy array or a
      pandas Series) holding the class labels.

  Returns:
    The sum of ``-p * log2(p)`` over the distinct values of ``col``, where
    ``p`` is the fraction of entries equal to that value. An empty ``col``
    yields 0.0.
  """
  _, class_counts = np.unique(col, return_counts=True)
  # Relative frequency of each distinct label.
  probabilities = class_counts / col.shape[0]
  # Start from 0.0 and accumulate left to right, one term per label.
  return sum((-p * np.log2(p) for p in probabilities), 0.0)

In [None]:
def divide_data(x_data,nkey,mval):
  """Split ``x_data`` into two frames around a threshold on one column.

  Args:
    x_data: DataFrame to split.
    nkey: Name of the column to compare against the threshold.
    mval: Threshold value.

  Returns:
    A ``(x_right, x_left)`` tuple: ``x_right`` holds the rows where
    ``x_data[nkey] >= mval``, ``x_left`` holds every other row. Both keep
    the original columns, row order and index labels; either may be empty.
  """
  # A boolean mask replaces the row-by-row DataFrame.append loop: append was
  # removed in pandas 2.0, was quadratic, and the loop's mix of .loc[ix] and
  # .iloc[ix] only worked when the index was exactly 0..n-1.
  goes_right = x_data[nkey] >= mval
  x_right = x_data[goes_right].copy()
  # Negating the mask (rather than testing "<") keeps rows whose comparison
  # is False for any reason, e.g. NaN, on the left, as the loop's else did.
  x_left = x_data[~goes_right].copy()
  return x_right,x_left

In [None]:
def info_gain(x_data,nkey,mval):
  """Return the information gain of splitting ``x_data`` on ``nkey`` at ``mval``.

  Args:
    x_data: DataFrame with an ``output`` label column.
    nkey: Name of the feature column to split on.
    mval: Threshold passed to ``divide_data``.

  Returns:
    Entropy of ``x_data.output`` minus the size-weighted entropy of the two
    partitions, or the sentinel ``-1e5`` when either partition is empty.
  """
  upper, lower = divide_data(x_data, nkey, mval)

  total_rows = x_data.shape[0]
  lower_weight = float(lower.shape[0]) / total_rows
  upper_weight = float(upper.shape[0]) / total_rows

  # A split that leaves one side empty separates nothing; report a very
  # low gain so it never wins the argmax in DecisionTree.train.
  if 0 in (lower.shape[0], upper.shape[0]):
    return -1e5

  parent_entropy = entropy(x_data.output)
  children_entropy = lower_weight * entropy(lower.output) + upper_weight * entropy(upper.output)
  return parent_entropy - children_entropy

In [None]:
class DecisionTree:
  """Binary decision-tree node that splits one feature at its mean value.

  Each node stores the feature it splits on (``nkey``), the threshold
  (``mval``), the majority label of the rows it was trained on (``target``)
  and, for internal nodes, its ``left`` and ``right`` child nodes.
  """

  def __init__(self,depth=0,max_depth=5):
    """Create an untrained node.

    Args:
      depth: Depth of this node in the tree; children get ``depth + 1``.
      max_depth: A node whose depth reaches this value does not split.
    """
    self.left = None    # child for rows with feature < mval (None on a leaf)
    self.right = None   # child for rows with feature >= mval (None on a leaf)
    self.nkey = None    # name of the feature this node splits on
    self.mval = None    # split threshold: mean of that feature at this node
    self.depth = depth
    self.max_depth = max_depth
    self.target = None  # label returned when prediction stops at this node

  @staticmethod
  def _majority_label(labels):
    """Return the most frequent value in ``labels`` (smallest value on ties)."""
    values, counts = np.unique(labels, return_counts=True)
    return values[np.argmax(counts)]

  def train(self,x_train):
    """Fit this node on ``x_train`` and recursively grow its children.

    Args:
      x_train: DataFrame with 'Age', 'OP_year', 'Nodes' and 'output'
        columns. Child frames are re-indexed from 0 before recursing.
    """
    features = ['Age','OP_year','Nodes']

    # Every node predicts the majority class of its own rows. The previous
    # "mean >= 0.5" test assumed 0/1 labels, yet the labels assigned were 1
    # and 2; with such labels the mean is always >= 0.5, so every node was
    # given target 1.
    self.target = self._majority_label(x_train.output)

    # Candidate split for each feature is its mean; keep the best gain.
    info_gains = [info_gain(x_train, name, x_train[name].mean()) for name in features]
    self.nkey = features[np.argmax(info_gains)]
    self.mval = x_train[self.nkey].mean()

    # Depth limit reached: stay a leaf without partitioning the data.
    if self.depth >= self.max_depth:
      return

    data_right,data_left = divide_data(x_train,self.nkey,self.mval)
    # The split separates nothing: stay a leaf.
    if data_left.shape[0] == 0 or data_right.shape[0] == 0:
      return

    self.left = DecisionTree(self.depth+1,self.max_depth)
    self.left.train(data_left.reset_index(drop=True))
    self.right = DecisionTree(self.depth+1,self.max_depth)
    self.right.train(data_right.reset_index(drop=True))

  def predict(self,test):
    """Return the predicted label for a single row.

    Args:
      test: Row indexable by feature name (e.g. a pandas Series or a dict).

    Returns:
      The ``target`` of the leaf reached by following the splits.
    """
    # Both children are always created together, so either being None
    # means this node is a leaf.
    if self.left is None or self.right is None:
      return self.target
    # ">=" mirrors divide_data, which sends rows equal to the threshold to
    # the right; the old ">" / "<" pair returned None for such rows.
    if test[self.nkey] >= self.mval:
      return self.right.predict(test)
    return self.left.predict(test)

In [None]:
# Use the first 75% of rows for training and the remaining rows for testing.
split = int(df.shape[0] * 0.75)
train_data = df.iloc[:split]
# Re-index the test rows from 0 so they can be looked up with .loc[0..n-1].
test_data = df.iloc[split:].reset_index(drop=True)

In [None]:
# Root node built with the constructor defaults (depth=0, max_depth=5).
dt = DecisionTree()

In [None]:
# Fit the root on the training rows; train() recursively builds the children.
dt.train(train_data)

In [None]:
# Predict a label for every test row; test_data is indexed 0..n-1, so
# .loc[row_id] fetches the row at that position.
y_pred = [
  dt.predict(test_data.loc[row_id])
  for row_id in range(test_data.shape[0])
]

In [None]:
# Percentage of test rows whose predicted label equals the true label.
print("Accuracy: ",np.mean(y_pred == test_data['output'])*100)

Accuracy:  70.12987012987013


In [None]:
# Feature the root node splits on.
dt.nkey

'Nodes'

In [None]:
# Feature chosen by the root's right child (rows with root feature >= threshold).
dt.right.nkey

'Age'

In [None]:
# Feature chosen by the root's left child (rows with root feature < threshold).
dt.left.nkey

'OP_year'