In [1]:
import math
from collections import Counter, defaultdict
from functools import partial
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, \
                            classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Use matplotlib's built-in "fivethirtyeight" style sheet for figures.
plt.style.use("fivethirtyeight")

### Dataset 1: Dataset dummy berisi data kandidat yang diwawancarai, dengan informasi seperti level kandidat, bahasa pemrograman, dan lainnya, serta variabel target yang menyatakan apakah kandidat tersebut diterima (True) atau tidak (False)

In [2]:
# Toy interview dataset: each record is a (features, label) pair, where
# `features` maps an attribute name to its value and `label` tells whether
# the candidate was accepted (True) or not (False).
dummy_data = [
    ({"level": "Senior", "lang": "Java", "tweets": "no", "phd": "no"}, False),
    ({"level": "Senior", "lang": "Java", "tweets": "no", "phd": "yes"}, False),
    ({"level": "Mid", "lang": "Python", "tweets": "no", "phd": "no"}, True),
    ({"level": "Junior", "lang": "Python", "tweets": "no", "phd": "no"}, True),
    ({"level": "Junior", "lang": "R", "tweets": "yes", "phd": "no"}, True),
    ({"level": "Junior", "lang": "R", "tweets": "yes", "phd": "yes"}, False),
    ({"level": "Mid", "lang": "R", "tweets": "yes", "phd": "yes"}, True),
    ({"level": "Senior", "lang": "Python", "tweets": "no", "phd": "no"}, False),
    ({"level": "Senior", "lang": "R", "tweets": "yes", "phd": "no"}, True),
    ({"level": "Junior", "lang": "Python", "tweets": "yes", "phd": "no"}, True),
    ({"level": "Senior", "lang": "Python", "tweets": "yes", "phd": "yes"}, True),
    ({"level": "Mid", "lang": "Python", "tweets": "no", "phd": "yes"}, True),
    ({"level": "Mid", "lang": "Java", "tweets": "yes", "phd": "no"}, True),
    ({"level": "Junior", "lang": "Python", "tweets": "no", "phd": "yes"}, False),
]

In [3]:
# Report how many labeled records the dataset holds.
print("Number of data:", len(dummy_data))

Number of data: 14


In [4]:
# Peek at the first record: a (features, label) pair.

sample = dummy_data[0]
sample_features, sample_label = sample

print("features:", sample_features)
print("label:", sample_label)

features: {'level': 'Senior', 'lang': 'Java', 'tweets': 'no', 'phd': 'no'}
label: False


In [5]:
# Unique values of the "level" feature.

features = [pair[0] for pair in dummy_data]
levels = [feature["level"] for feature in features]
# sorted() pins the display order: iterating a bare set of strings can come
# out in a different order after a kernel restart, because str hashing is
# randomized per process.
unique_levels = sorted(set(levels))

print("values in level:", levels)
print("unique values in level:", unique_levels)
print("number of unique values in level:", len(unique_levels))

values in level: ['Senior', 'Senior', 'Mid', 'Junior', 'Junior', 'Junior', 'Mid', 'Senior', 'Senior', 'Junior', 'Senior', 'Mid', 'Mid', 'Junior']
unique values in level: ['Mid', 'Junior', 'Senior']
number of unique values in level: 3


In [6]:
# Mode(s) of the "level" feature.

# Counter tallies in first-seen order; dict() keeps that order and makes the
# result print as a plain dict.
dict_level_count = dict(Counter(levels))

# Several values can tie for the highest frequency, so keep every one of them.
max_level_freq = max(dict_level_count.values())
modes = {
    level_name: freq
    for level_name, freq in dict_level_count.items()
    if freq == max_level_freq
}

print("freq of each values:", dict_level_count)
print("mode:", modes)

freq of each values: {'Senior': 5, 'Mid': 4, 'Junior': 5}
mode: {'Senior': 5, 'Junior': 5}


## Model

In [7]:
# Entropy and partition helpers (applied to the dummy data in the cells below)

def entropy(class_probabilites):
    """Return the base-2 entropy of a list of class probabilities.

    Zero probabilities are skipped, so they add nothing to the total.
    """
    total = 0
    for p in class_probabilites:
        if p:
            total += -p * np.log2(p)
    return total


def class_probabilities(labels):
    """Return the relative frequency of each distinct label.

    Frequencies are listed in the order the labels first appear.
    """
    total_count = len(labels)
    label_counts = Counter(labels)
    probabilities = []
    for count in label_counts.values():
        probabilities.append(count / total_count)
    return probabilities


def data_entropy(labeled_data):
    """Return the entropy of the labels in `labeled_data`.

    Each item is a (features, label) pair; only the label is used.
    """
    pair_labels = [pair_label for _, pair_label in labeled_data]
    return entropy(class_probabilities(pair_labels))


def partition_entropy(subsets):
    """Return the weighted entropy of a partition of labeled data.

    `subsets` is any iterable of collections of (features, label) pairs.
    Each subset's entropy (via `data_entropy`) is weighted by its share of
    the total number of items.  Returns 0 when there are no items at all.
    """
    # Materialize once: the subsets are walked twice below, and a one-shot
    # iterator (e.g. a generator) would be exhausted after the first pass.
    subsets = list(subsets)
    total_count = sum(len(subset) for subset in subsets)
    if total_count == 0:
        # Nothing to weigh; avoids 0 / 0 when every subset is empty.
        return 0
    return sum(
        data_entropy(subset) * len(subset) / total_count
        for subset in subsets
    )


def partition_by(inputs, attribute):
    """Group labeled inputs by the value they hold for `attribute`.

    Each input is a (features_dict, label) pair.  Returns a mapping from
    attribute value to the list of inputs sharing that value, in input order.
    """
    grouped = defaultdict(list)
    for record in inputs:
        record_features = record[0]
        grouped[record_features[attribute]].append(record)
    return grouped


def partition_entropy_by(inputs, attribute):
    """Return the weighted entropy left after splitting `inputs` on `attribute`."""
    groups = partition_by(inputs, attribute).values()
    return partition_entropy(groups)

In [8]:
# Entropy of the labels before any split.
label_entropy = data_entropy(dummy_data)
print("label entropy:", label_entropy)

# Weighted entropy after splitting on each feature.
print("entropy:")
for feature in ("level", "lang", "tweets", "phd"):
    feature_entropy = partition_entropy_by(dummy_data, feature)
    print(feature, ":", feature_entropy)

# The smaller the entropy, the smaller the uncertainty, i.e. the more
# certain the outcome within each group.

label entropy: 0.9402859586706311
entropy:
level : 0.6935361388961919
lang : 0.8601317128547441
tweets : 0.7884504573082896
phd : 0.8921589282623617


In [9]:
# Collect the labels that fall under each value of "level".
split_by_level = pd.DataFrame(
    [
        (features["level"], label)
        for features, label in dummy_data
        if "level" in features
    ],
    columns=["level", "label"],
)
split_by_level.groupby("level").agg({"label": list})

Unnamed: 0_level_0,label
level,Unnamed: 1_level_1
Junior,"[True, True, False, True, False]"
Mid,"[True, True, True, True]"
Senior,"[False, False, False, True, True]"


In [10]:
# Restrict to the "Senior" branch and score the remaining features there.
senior_inputs = [
    (candidate, label)
    for candidate, label in dummy_data
    if candidate["level"] == "Senior"
]
print("split by senior value:")
pprint(senior_inputs)

for feature in ('lang', 'tweets', 'phd'):
    senior_entropy = partition_entropy_by(senior_inputs, feature)
    print(feature, senior_entropy)

split by senior value:
[({'lang': 'Java', 'level': 'Senior', 'phd': 'no', 'tweets': 'no'}, False),
 ({'lang': 'Java', 'level': 'Senior', 'phd': 'yes', 'tweets': 'no'}, False),
 ({'lang': 'Python', 'level': 'Senior', 'phd': 'no', 'tweets': 'no'}, False),
 ({'lang': 'R', 'level': 'Senior', 'phd': 'no', 'tweets': 'yes'}, True),
 ({'lang': 'Python', 'level': 'Senior', 'phd': 'yes', 'tweets': 'yes'}, True)]
lang 0.4
tweets 0.0
phd 0.9509775004326938


In [11]:
# Collect the labels that fall under each value of "phd".
split_by_phd = pd.DataFrame(
    [
        (features["phd"], label)
        for features, label in dummy_data
        if "phd" in features
    ],
    columns=["phd", "label"],
)
split_by_phd.groupby("phd").agg({"label": list})

Unnamed: 0_level_0,label
phd,Unnamed: 1_level_1
no,"[False, True, True, True, False, True, True, T..."
yes,"[False, False, True, True, True, False]"


#### Setelah menghitung entropy setiap fitur pada data dummy secara manual satu per satu, kita akan membuat model Decision Tree yang menghitung nilai entropy seperti di atas untuk setiap fitur secara otomatis menggunakan fungsi berikut.

In [12]:
def build_tree(inputs, split_candidates=None):
    """Recursively build a decision tree from labeled inputs.

    At each node the attribute with the lowest partition entropy is chosen,
    the inputs are split on its values, and the attribute is removed from the
    candidates passed to the subtrees.

    Parameters
    ----------
    inputs : sequence of (features_dict, label) pairs; labels are used for
        their truthiness.
    split_candidates : attributes still available for splitting.  When None
        (the first call), the keys of the first input's features are used.

    Returns
    -------
    Either a bool leaf, or a tuple ``(attribute, subtrees)`` where `subtrees`
    maps each observed attribute value to a subtree and ``None`` to the
    default label for unseen or missing values.
    """
    # count Trues and Falses in the inputs
    num_inputs = len(inputs)
    num_trues = sum(1 for _, label in inputs if label)
    num_falses = num_inputs - num_trues

    # no True(s)? return "False" leaf.  Empty `inputs` also land here, so they
    # yield a False leaf instead of failing on inputs[0] below.
    if num_trues == 0:
        return False
    # no False(s)? return "True" leaf
    if num_falses == 0:
        return True

    # if this is our first pass,
    # all keys of the first input are split candidates
    if split_candidates is None:
        split_candidates = inputs[0][0].keys()

    # if no split candidates left, return the majority leaf
    # NOTE: a tie goes to True here (>=) but to False for the None default
    # further down (>); both comparisons are kept as they were.
    if not split_candidates:
        return num_trues >= num_falses

    # otherwise, split on the attribute with the lowest partition entropy
    best_attribute = min(
        split_candidates,
        key=partial(partition_entropy_by, inputs)
    )
    partitions = partition_by(inputs, best_attribute)
    new_candidates = [a for a in split_candidates if a != best_attribute]

    # recursively build the subtrees
    subtrees = {
        attribute_value: build_tree(subset, new_candidates)
        for attribute_value, subset in partitions.items()
    }
    subtrees[None] = num_trues > num_falses # default case
    return (best_attribute, subtrees)


def classify(tree, input):
    """Walk `tree` with `input` and return the boolean leaf that is reached."""
    node = tree
    # Internal nodes are (attribute, subtrees) pairs; leaves are True/False.
    while node not in (True, False):
        attribute, branches = node
        # .get() yields None when the input lacks this attribute
        value = input.get(attribute)
        # unseen or missing values fall back to the None (default) branch
        node = branches[value] if value in branches else branches[None]
    return node

In [13]:
# Build the tree from the full dummy dataset and display it.
tree = build_tree(dummy_data)
tree

('level',
 {'Senior': ('tweets', {'no': False, 'yes': True, None: False}),
  'Mid': True,
  'Junior': ('phd', {'no': True, 'yes': False, None: True}),
  None: True})

In [14]:
# Print each prediction next to its true label (shown in parentheses).
for test, target in dummy_data:
    prediction = classify(tree, test)
    print(prediction, f"({target})")

False (False)
False (False)
True (True)
True (True)
True (True)
False (False)
True (True)
False (False)
True (True)
True (True)
True (True)
True (True)
True (True)
False (False)


In [15]:
# A candidate with no "tweets" or "phd" entries: classify() falls back to the
# None (default) branch wherever the split attribute is missing.
sample = dict(level="Junior", lang="Python")
classify(tree, sample)

True