<a href="https://colab.research.google.com/github/jamestheengineer/data-science-from-scratch-Python/blob/master/Chapter_17.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from typing import List
import math

def entropy(class_probabilities: List[float]) -> float:
  """Return the entropy, in bits, of a list of class probabilities.

  Entries that are not strictly positive are skipped, so a zero
  probability contributes nothing instead of hitting log(0).
  """
  positive = (p for p in class_probabilities if p > 0)
  return sum(-p * math.log(p, 2) for p in positive)

# Sanity checks: a single certain class has zero entropy, two equally
# likely classes have exactly one bit, and a 25/75 split lies in between.
assert entropy([1.0]) == 0
assert entropy([0.5, 0.5]) == 1
assert 0.81 < entropy([0.25, 0.75]) < 0.82


In [None]:
from typing import Any
from collections import Counter

def class_probabilities(labels: List[Any]) -> List[float]:
  """Return the fraction of labels that falls into each distinct class.

  One fraction is produced per distinct label, in the order in which the
  labels are first encountered.
  """
  n_labels = len(labels)
  tallies = Counter(labels)
  return [tally / n_labels for tally in tallies.values()]

def data_entropy(labels: List[Any]) -> float:
  """Return the entropy of the class distribution observed in labels."""
  probabilities = class_probabilities(labels)
  return entropy(probabilities)

# Sanity checks: one label means no uncertainty, two distinct labels give
# one bit, and a 1-vs-3 split matches the entropy of [0.25, 0.75].
assert data_entropy(['a']) == 0
assert data_entropy([True, False]) == 1
assert data_entropy([3, 4, 4, 4]) == entropy([0.25, 0.75])

def partition_entropy(subsets: List[List[Any]]) -> float:
  """Return the entropy from this partition of data into subsets.

  Each subset's entropy is weighted by the share of all labels that the
  subset contains, and the weighted values are added up.
  """
  n_labels = sum(map(len, subsets))
  weighted = [data_entropy(labels) * len(labels) / n_labels
              for labels in subsets]
  return sum(weighted)

In [None]:
from typing import NamedTuple, Optional

class Candidate(NamedTuple):
  """One candidate record: four attributes plus an optional outcome label."""
  level: str
  lang: str
  tweets: bool
  phd: bool
  # The label; defaults to None so a Candidate can be built without one.
  did_well: Optional[bool] = None # allow unlabeled data

           # Labeled sample candidates; the columns are:
           #  level     lang     tweets  phd  did_well
inputs = [Candidate('Senior', 'Java',   False, False, False),
          Candidate('Senior', 'Java',   False, True,  False),
          Candidate('Mid',    'Python', False, False, True),
          Candidate('Junior', 'Python', False, False, True),
          Candidate('Junior', 'R',      True,  False, True),
          Candidate('Junior', 'R',      True,  True,  False),
          Candidate('Mid',    'R',      True,  True,  True),
          Candidate('Senior', 'Python', False, False, False),
          Candidate('Senior', 'R',      True,  False, True),
          Candidate('Junior', 'Python', True,  False, True),
          Candidate('Senior', 'Python', True,  True,  True),
          Candidate('Mid',    'Python', False, True,  True),
          Candidate('Mid',    'Java',   True,  False, True),
          Candidate('Junior', 'Python', False, True,  False)
         ]

In [None]:
from typing import Dict, TypeVar
from collections import defaultdict

T = TypeVar('T') # generic type for inputs, so partition_by's result lists keep the element type it was given

def partition_by(inputs: List[T], attribute: str) -> Dict[Any, List[T]]:
  """Group the inputs by the value each one holds for the given attribute.

  The result maps every attribute value seen to the list of inputs that
  carry it, with the inputs kept in their original order.
  """
  groups: Dict[Any, List[T]] = defaultdict(list)
  for item in inputs:
    # File each item under its own value of the attribute
    groups[getattr(item, attribute)].append(item)
  return groups

def partition_entropy_by(inputs: List[Any],
                         attribute: str,
                         label_attribute: str) -> float:
  """Compute the entropy corresponding to the given partition.

  The inputs are grouped by attribute, and the entropy is then taken over
  the label_attribute values found inside each group.
  """
  groups = partition_by(inputs, attribute).values()

  # partition_entropy works on class labels only, so strip each group
  # down to the label of every member
  labels_per_group = [[getattr(member, label_attribute) for member in group]
                      for group in groups]

  return partition_entropy(labels_per_group)

In [None]:
# Print the partition entropy obtained by splitting the full data set on
# each attribute in turn, using did_well as the label.
for key in ['level', 'lang', 'tweets', 'phd']:
  print(key, partition_entropy_by(inputs, key, 'did_well'))

# Pin the expected ranges; splitting on level yields the lowest entropy.
assert 0.69 < partition_entropy_by(inputs, 'level', 'did_well') < 0.70
assert 0.86 < partition_entropy_by(inputs, 'lang', 'did_well') < 0.87
assert 0.78 < partition_entropy_by(inputs, 'tweets', 'did_well') < 0.79
assert 0.89 < partition_entropy_by(inputs, 'phd', 'did_well') < 0.90

level 0.6935361388961919
lang 0.8601317128547441
tweets 0.7884504573082896
phd 0.8921589282623617


In [None]:
# Keep only the Senior candidates and check how each remaining attribute
# splits that subset.
senior_inputs = [candidate for candidate in inputs
                 if candidate.level == 'Senior']

# The entropies are computed floats, so compare them with a tolerance
# instead of exact equality; abs_tol is required when the target is 0.0.
assert math.isclose(
    partition_entropy_by(senior_inputs, 'lang', 'did_well'), 0.4)
assert math.isclose(
    partition_entropy_by(senior_inputs, 'tweets', 'did_well'), 0.0,
    abs_tol=1e-12)
assert 0.95 < partition_entropy_by(senior_inputs, 'phd', 'did_well')

In [None]:
from typing import NamedTuple, Union, Any

class Leaf(NamedTuple):
  """A terminal tree node holding a fixed prediction."""
  value: Any  # returned by classify when it reaches this node

class Split(NamedTuple):
  """An internal tree node that branches on the value of one attribute."""
  attribute: str  # name of the attribute read from the input
  subtrees: dict  # maps an attribute value to the subtree to follow next
  # Returned by classify when the input's value has no entry in subtrees.
  default_value: Any = None

# A decision tree is either a Leaf (a prediction) or a Split (a branch).
DecisionTree = Union[Leaf, Split]
# Hand-built example tree over the candidate attributes.
hiring_tree = Split('level', {  # first, consider "level"
    'Junior': Split('phd', {    # if level is Junior, next look at phd
        False: Leaf(True),      # if phd is false, predict true
        True: Leaf(False)       # if phd is true, predict false
    }),
    'Mid': Leaf(True),           # if level is Mid, just predict true
    'Senior': Split('tweets', {  # if level is Senior, look at tweets
        False: Leaf(False),       # if tweets is false, predict false
        True: Leaf(True)          # if tweets is true, predict true
  })
})

In [None]:
def classify(tree: DecisionTree, input: Any) -> Any:
  """Return the prediction the decision tree makes for the input.

  The tree is walked from the root: at every Split the input's value for
  that split's attribute selects the next subtree. Reaching a Leaf yields
  its value; a value with no matching subtree yields the split's
  default_value.
  """
  node = tree
  while not isinstance(node, Leaf):
    # Read the value this split branches on
    branch = getattr(input, node.attribute)

    # No subtree for this value: fall back to the default
    if branch not in node.subtrees:
      return node.default_value

    # Descend into the matching subtree
    node = node.subtrees[branch]

  return node.value
  

In [None]:
def build_tree_id3(inputs: List[Any],
                   split_attributes: List[str],
                   target_attribute: str) -> DecisionTree:
    """Build a decision tree predicting target_attribute from the inputs.

    Args:
      inputs: labeled examples; each must expose target_attribute.
      split_attributes: names of the attributes still available to split on.
      target_attribute: name of the attribute that holds the label.

    Returns:
      A Leaf with the only label present when all inputs agree, or a Leaf
      with the most common label when no split attributes remain.

    Raises:
      IndexError: if inputs is empty, since there is no most common label.

    NOTE(review): the split-by-best-attribute step is not part of the
    lines visible here; confirm how the remaining case is handled.
    """
    # Count target labels
    label_counts = Counter(getattr(item, target_attribute)
                           for item in inputs)
    most_common_label = label_counts.most_common(1)[0][0]

    # If there's a unique label, predict it
    if len(label_counts) == 1:
        return Leaf(most_common_label)

    # If no split attributes left, return the majority label
    # (previously referenced the undefined name `most_common_labels`)
    if not split_attributes:
        return Leaf(most_common_label)

# Otherwise split by the best attribute
