<a href="https://colab.research.google.com/github/kyle-gao/ML_ipynb/blob/master/DecisionTreeCART_from_scratch_with_pd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Copyright 2020 Yi Lin(Kyle) Gao

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 .


In [2]:
import numpy as np
import pandas as pd
from sklearn import datasets

In [3]:
# Load the Iris dataset through sklearn's `datasets` module; the cells below
# read its `.data` and `.target` attributes.
iris = datasets.load_iris()

In [4]:
# Feature matrix as-is; the targets are turned into a single column so they
# can be joined to the features along axis 1 in the next cell.
x = iris.data
y = iris.target.reshape(-1, 1)

In [5]:
columns = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', "Target"]
# Features and target side by side in one frame, target in the last column
# (the tree functions below read the class labels from the last column).
df = pd.DataFrame(data=np.concatenate([x, y], axis=1), columns=columns)
# Mark the target column as categorical.
df["Target"] = df["Target"].astype("category")

In [6]:
def partition(df, column, value):
  """
  Split a dataframe in two according to a yes/no question about one column.

  Inputs:
  df - a pd.DataFrame
  column - a column label, string or int
  value - the value to compare against. If the column dtype is "category",
          "object" or "bool" the question == value is asked; for any other
          dtype the question >= value is asked.
  Returns:
  (true_rows, false_rows) - the rows for which the question is True and the
  rows for which it is not. Every row of df ends up in exactly one of the two,
  so a row with a missing value in `column` goes to false_rows.
  """
  if df[column].dtype.name in ["category","object","bool"]:
    answer = df[column] == value
  else:
    answer = df[column] >= value
  # Take the False side as the negated mask instead of a second comparison:
  # NaN fails both `>=` and `<`, so two comparisons would silently drop the row
  # from both halves.
  return df.loc[answer], df.loc[~answer]

In [7]:
def gini_inpurity(df):
  """
  Compute the Gini impurity of the class labels in a dataframe.

  input:
  df - a dataframe with the last column containing class labels
  returns:
  the gini impurity, 1 - sum(p_label**2), as a float; 0.0 when there are no
  labels to count (e.g. an empty dataframe)
  """
  counts = df.iloc[:,-1].value_counts()
  total = counts.sum()
  if total == 0:
    # Nothing to measure: report the node as pure instead of dividing by zero.
    return 0.0
  # Work on the counts themselves. value_counts() is ordered by frequency, so
  # looking a label up by integer position (int(label)) picks the wrong entry
  # or runs off the end unless the labels happen to be exactly 0..k-1.
  probabilities = counts / total
  return float(1 - (probabilities**2).sum())

In [8]:
def information_gain(left, right, current):
  """
  Impurity reduction obtained by splitting a node into `left` and `right`.

  Inputs:
  left, right - the two dataframes produced by the split
  current - the gini impurity of the node before the split
  Returns:
  current minus the size-weighted gini impurity of the two children
  """
  n_left = len(left)
  n_total = n_left + len(right)
  # share of the rows that went to the left child
  weight_left = float(n_left) / n_total

  weighted_left = weight_left * gini_inpurity(left)
  weighted_right = (1 - weight_left) * gini_inpurity(right)
  return current - weighted_left - weighted_right


In [9]:
# Sanity check: information gain of splitting the full frame on
# "Sepal Width" >= 3.0, measured against the impurity of the unsplit frame.
current = gini_inpurity(df)
left,right = partition(df,"Sepal Width", 3.0)
information_gain(left,right,current)

0.09771741180909241

In [10]:
def best_split(df):
  """
  Finds the best partition over the feature columns
  (every column except the last, which holds the class labels).

  Input:
  df - a pd.Dataframe
  Returns:
  best_gain - information gain of best partition (0 if no split helps)
  saved_col - the partition feature (None if no split helps)
  saved_value - the partition threshold/value (None if no split helps)
  """
  current = gini_inpurity(df)
  best_gain = 0
  saved_col = None
  saved_value = None

  for column in df.columns[:-1]:
    # Repeated values ask the same question, so try each distinct value once.
    # drop_duplicates() keeps first occurrences in their original order, and
    # ties are won by the earliest candidate (strict `>` below), so the chosen
    # split is the same as when every row value is tried.
    candidates = df[column].drop_duplicates()
    for value in candidates:
      # split the data
      left, right = partition(df, column, value)
      # skip the split if one of the splits is empty
      if len(left) == 0 or len(right) == 0:
        continue
      info_gain = information_gain(left, right, current)
      if info_gain > best_gain:
        best_gain = info_gain
        saved_col = column
        saved_value = value
  return best_gain, saved_col, saved_value

In [11]:
# Small toy dataset: one string feature, one integer feature, and a string
# class label in the last column.
training_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]
# Same data as a dataframe with named columns ("Target" last).
df_train = pd.DataFrame(training_data, columns = ["Color", "Number", "Target"])

In [12]:
class Leaf:
  """
  Terminal node of the tree: summarises the class labels of the rows that reached it.

  predictions - dict mapping each class label to its row count
  predict - dict mapping each class label to its share of the rows,
            formatted as a percentage string such as "100.0%"
  """
  def __init__(self, df):
    # the class labels are read from the last column of df
    label_counts = df.iloc[:,-1].value_counts()
    self.predictions = label_counts.to_dict()

    # total of all the counts, as a float
    self.__sum = np.asarray(list(self.predictions.values()), dtype=float).sum()

    # turn every count into a percentage of the total
    shares = {}
    for label, count in self.predictions.items():
      shares[label] = str(count / self.__sum * 100) + "%"
    self.predict = shares

In [13]:
class Node:
  """Internal decision node: a question plus the branch to follow for each answer."""
  def __init__(self, col, value, left, right):
    """
    Inputs:
    col - a dataframe column label
    value - the value in that column the question compares against
    left - the branch (Node or Leaf) followed when the question is True
    right - the branch (Node or Leaf) followed when the question is False
    """
    # the question is stored as a (column, value) pair
    self.threshold = col, value
    self.left, self.right = left, right

In [14]:
def build_tree(df, depth = 0, max_depth = None):
  """
  Recursively build the tree from df

  Inputs:
  df - a pd.DataFrame with the class labels in the last column
  depth - depth of the node being built (0 for the root)
  max_depth - an integer depth at which to stop splitting, or None for no limit
  Returns:
  a Node, or a Leaf when a base case is hit
  """

  #base case: max depth is reached
  #checked first so the split search is not run only to be thrown away
  if isinstance(max_depth,(int, np.integer)) and depth >= max_depth:
    return Leaf(df)

  gain, col, val = best_split(df)

  #base case: no split improves on the current node
  if gain == 0:
    return Leaf(df)

  left, right = partition(df, col, val)

  #recursive calls
  left_branch = build_tree(left, depth + 1, max_depth)
  right_branch = build_tree(right, depth + 1, max_depth)
  return Node(col,val, left_branch, right_branch)



In [15]:
def print_tree(node, df, spacing= ""):
    """
    Recursively prints the tree.

    Inputs:
    node - the Node or Leaf to print
    df - a dataframe holding the split columns; only the column dtypes are
         read, to decide whether a question is shown as "==" or ">="
    spacing - indentation prefix for this level
    """

    #base case: node is Leaf
    if isinstance(node,Leaf):
        print (spacing + "Predict", node.predict)
        return

    (col,val) = node.threshold

    # same dtype rule as partition(): equality for category/object/bool columns,
    # threshold otherwise
    if df[col].dtype.name in ["category","object","bool"]:
        question = "=="
    else:
        question = ">="
    print (spacing + str(col) + question + str(val) + "?")

    #recursive calls
    print (spacing + '--> True:')
    print_tree(node.left, df, spacing + "  ")
    print (spacing + '--> False:')
    print_tree(node.right, df, spacing + "  ")


In [16]:
# Build a tree on the full frame; max_depth is left at its default (None), so
# no depth limit is applied.
tree = build_tree(df)

In [17]:
# Print the learned splits; df is passed so print_tree can read column dtypes.
print_tree(tree, df)

Petal Length>=3.0?
--> True:
  Petal Width>=1.8?
  --> True:
    Petal Length>=4.9?
    --> True:
      Predict {2.0: '100.0%', 1.0: '0.0%', 0.0: '0.0%'}
    --> False:
      Sepal Length>=6.0?
      --> True:
        Predict {2.0: '100.0%', 1.0: '0.0%', 0.0: '0.0%'}
      --> False:
        Predict {1.0: '100.0%', 2.0: '0.0%', 0.0: '0.0%'}
  --> False:
    Petal Length>=5.0?
    --> True:
      Petal Width>=1.6?
      --> True:
        Sepal Length>=7.2?
        --> True:
          Predict {2.0: '100.0%', 1.0: '0.0%', 0.0: '0.0%'}
        --> False:
          Predict {1.0: '100.0%', 2.0: '0.0%', 0.0: '0.0%'}
      --> False:
        Predict {2.0: '100.0%', 1.0: '0.0%', 0.0: '0.0%'}
    --> False:
      Petal Width>=1.7?
      --> True:
        Predict {2.0: '100.0%', 1.0: '0.0%', 0.0: '0.0%'}
      --> False:
        Predict {1.0: '100.0%', 2.0: '0.0%', 0.0: '0.0%'}
--> False:
  Predict {0.0: '100.0%', 2.0: '0.0%', 1.0: '0.0%'}


In [18]:
def classify(row, node):
    """
    Recursively follow the tree and return the prediction of the leaf reached.

    Inputs:
    row - a single observation indexable by column label (e.g. df.iloc[i])
    node - the Node or Leaf to start from
    Returns:
    the `predict` dict of the Leaf the row ends up in
    """
    if isinstance(node, Leaf):
        return node.predict
    col, val = node.threshold

    # Pick the question from the stored threshold rather than from
    # row[col].dtype: plain Python scalars (e.g. str) have no .dtype, and
    # integer features are split with >= just like floats. bool is excluded
    # because it is a subclass of int but is split on equality.
    # NOTE(review): Node does not record the column dtype, so a category column
    # holding numbers cannot be told apart from a numeric column here.
    is_numeric = isinstance(val, (int, float, np.integer, np.floating)) and not isinstance(val, bool)
    if is_numeric:
        answer = row[col] >= val
    else:
        answer = row[col] == val

    if answer:
        return classify(row, node.left)
    return classify(row, node.right)

In [19]:
# Classify the first row of the frame; the result is the reached leaf's
# class-percentage dict.
classify(df.iloc[0],tree)

{0.0: '100.0%', 1.0: '0.0%', 2.0: '0.0%'}