In [35]:
import pandas as pd
import math
import logging
import os

In [36]:
logger = logging.getLogger('DT_C45')

In [37]:
class Tree(object):
    root = None
    
    def __init__(self, root):
        self.root = root
    
    def print_tree(self):
        if self.root == None:
            return
        else:
            node = root.print_node()

In [38]:
class Node(object):
    _feature = None
    _label = None
    _is_discrete = None
    _is_leaf = None
    _child_nodes = None
    _value = None

    def __str__(self):
        return ('[%s, %s, %s, %s, %s, %s]' % (self._feature, self._value, self._is_discrete, self._label, self._is_leaf, self._child_nodes.keys()))
    
    @property
    def label(self):
        return self._label
    
    @label.setter
    def label(self, value):
        self._label = value

    @property
    def feature(self):
        return self._feature
    
    @feature.setter
    def feature(self, value):
        self._feature = value

    @property
    def value(self):
        return self._value
    
    @value.setter
    def value(self, value):
        self._value = value

    @property
    def is_leaf(self):
        return self._is_leaf
    
    @is_leaf.setter
    def is_leaf(self, value):
        self._is_leaf = value
    
    @property
    def is_discrete(self):
        return self._is_discrete
    
    @is_discrete.setter
    def is_discrete(self, value):
        self._is_discrete = value
    
    @property
    def child_nodes(self):
        return self._child_nodes

    def add_child(self, op, op_value, child_node):
        if self._is_discrete == False:
            self._value = op_value
        self._child_nodes[op] = child_node
    
    def _check_arg(self, key, kwargs):
        if key in kwargs:
            return kwargs[key]
        else:
            return None
    
    def __init__(self, **kwargs):
        self._feature = self._check_arg('feature', kwargs)
        self._is_discrete = self._check_arg('is_discrete', kwargs)
        self._is_leaf = self._check_arg('is_leaf', kwargs)
        self._label = self._check_arg('label', kwargs)
        self._value = self._check_arg('value', kwargs)
        self._child_nodes = {}
    
    def print_node(self):
        if self._is_leaf:
            logger.info('Leaf node - Label [%s]' % (self._label))
        else:
            for key in self._child_nodes.keys():
                logger.info('Node [%s] Discrete [%s], [%s][%s] -->' % (self._feature, self._is_discrete, key, self._value))
                self._child_nodes[key].print_node()
    
    def find_next_node(self, dataset, r_idx):
        if self._is_leaf == True:
            return None
        else:
            data = dataset.data
            val = data[self._feature][r_idx]
            if self._is_discrete == True:
                return self._child_nodes['=='+val]
            elif self._is_discrete == False:
                # support continuous data
                if val <= self._value:
                    return self._child_nodes['<=']
                else:
                    return self._child_nodes['>']
            else:
                pass

In [39]:
class Dataset(object):
    _data = None
    _column_properties = None
    
    def __init__(self, data, column_properties):
        self._data = data
        self._column_properties = column_properties
    
    def __len__(self):
        return len(self._data)
    
    @property
    def data(self):
        return self._data
    
    @data.setter
    def data(self, value):
        self._data = value
    
    @property
    def column_properties(self):
        return self._column_properties
    
    @column_properties.setter
    def column_properties(self, value):
        self._column_properties = value
    
    def column_property(self, key):
        return self._column_properties[key]

    def column_property_val(self, key, feature):
        return self._column_properties[key][feature]

In [40]:
def load_dataset():
    data = pd.DataFrame({'年龄':[24, 26, 23, 27, 34, 36, 36, 38, 40, 52, 67, 74, 67, 66, 65],
                         '年龄分类':['青年', '青年', '青年', '青年', '青年', '中年', '中年', '中年', '中年', '中年', '老年', '老年', '老年', '老年', '老年'], 
                         '有工作':['否', '否', '是', '是', '否', '否', '否', '是', '否', '否', '否', '否', '是', '是', '否'], 
                         '有房子':['否', '否', '否', '是', '否', '否', '否', '是', '是', '是', '是', '是', '否', '否', '否'], 
                         '信贷情况':['一般', '好', '好', '一般', '一般', '一般', '好', '好', '非常好', '非常好', '非常好', '好', '好', '非常好', '一般'], 
                         '类别':['不通过', '不通过', '通过', '通过', '不通过', '不通过', '不通过', '通过', '通过', '通过', '通过', '通过', '通过', '通过', '不通过']})
    column_properties = {'is_discrete': {'年龄': False, 
                                          '年龄分类': True, 
                                          '有工作': True, 
                                          '有房子': True, 
                                          '信贷情况': True, 
                                          '类别': True}}
    
    return Dataset(data, column_properties)

In [41]:
def load_file(path=os.getcwd(), file='data.xlsx'):
    data = pd.read_excel(os.path.join(path, file))
    # TODO column autoconfig
    column_properties = {'is_discrete': {'年龄': False, 
                                          '年龄分类': True, 
                                          '有工作': True, 
                                          '有房子': True, 
                                          '信贷情况': True, 
                                          '类别': True}}
    
    return Dataset(data, column_properties)

In [42]:
def load_iris(path=os.getcwd(), file='iris.data'):
    data = pd.read_csv(os.path.join(path, file))
#     data = pd.DataFrame({'SepalLengthCm':[6.2, 6.0],
#                          'Species':['versicolor', 'virginica']})
    column_properties = {'is_discrete': {'SepalLengthCm': False, 
                                         'SepalWidthCm': False, 
                                         'PetalLengthCm': False, 
                                         'PetalWidthCm': False, 
                                         'Species': True}}
    return Dataset(data, column_properties)

In [43]:
def load_test_dataset():
    data = pd.DataFrame({'年龄':[25, 37], 
                         '年龄分类':['青年', '中年'], 
                         '有工作':['是', '是'], 
                         '有房子':['是', '是'], 
                         '信贷情况':['一般', '好'], 
                         '类别':['通过', '通过']})
    column_properties = {'is_discrete': {'年龄': False, 
                                          '年龄分类': True, 
                                          '有工作': True, 
                                          '有房子': True, 
                                          '信贷情况': True, 
                                          '类别': True}}
    
    return Dataset(data, column_properties)

In [44]:
def calc_entropy(dataset):
    data = dataset.data
    labels = data[data.columns[-1]]
    m = len(data)

    entropy = 0.
#     logging.debug('y_labels:\n%s' % labels.value_counts())
    for _, cnt in labels.value_counts().items():
        p = cnt / m
        entropy += - p * math.log(p, 2)
    
    return entropy

In [45]:
def choose_best_feature(dataset):
    data = dataset.data
    columns = data.columns[:-1]
    label = data.columns[-1]
    m = len(data)
    logging.debug('data:\n%s' % data)
    
    base_entropy = calc_entropy(dataset)
    info_gain_rate = 0.
    best_feature = None
    best_feature_val = None
        
    for feature in columns:
        split_info = 0.
        c_entropy = 0.
        info_gain_rate_temp = 0.
        split_point = None
        
        if dataset.column_property_val('is_discrete', feature) == True:
            # support discrete data
            unique_feature_vals = data[feature].unique()
            logging.debug('feature: %s, unique_val: %s' % (feature, unique_feature_vals))

            for unique_feature_val in unique_feature_vals:
                sub_data_temp = split_data(dataset, feature, None, unique_feature_val)
                m_sub = len(sub_data_temp)
                delta_c_entropy = m_sub / m * calc_entropy(sub_data_temp)
                logging.debug('%s: %s, [%d/%d], delta_c_entropy: %.6f, sub_data:\n%s' % (feature, unique_feature_val, m_sub, m, delta_c_entropy, sub_data_temp.data))
                c_entropy += delta_c_entropy

            logging.debug('feature: %s, total_c_entropy: %.6f' % (feature, c_entropy))

            # calculate split info
            for _, cnt in data[feature].value_counts().items():
                p = cnt / m
                split_info += - p * math.log(p, 2)
        else:
            # support continuous data
            sorted_data = data.sort_values(feature, ascending=True)
            unique_feature_vals = sorted_data[feature].unique()
            if len(unique_feature_vals) < 2:
                # c_entropy is zero, maxium info gain
                return feature, unique_feature_vals[0]
            elif len(unique_feature_vals) == 2:
                discrete_points = [(unique_feature_vals[0] + unique_feature_vals[1]) / 2]
            else:
                discrete_points = (unique_feature_vals[0:-2] + unique_feature_vals[1:-1]) / 2
                
            # pick maxium of info gain
            info_gain = 0.
            l_p = 0.
            r_p = 0.
            for pt in discrete_points:
                c_entropy = 0.

                l_sub_data_temp = split_data(dataset, feature, '<=', pt)
                m_l_sub = len(l_sub_data_temp)
                delta_c_entropy = m_l_sub / m * calc_entropy(l_sub_data_temp)
                logging.debug('%s: <=%s, [%d/%d], delta_c_entropy: %.6f, sub_data:\n%s' % (feature, pt, m_l_sub, m, delta_c_entropy, l_sub_data_temp.data))
                c_entropy += delta_c_entropy

                r_sub_data_temp = split_data(dataset, feature, '>', pt)
                m_r_sub = len(r_sub_data_temp)
                delta_c_entropy = m_r_sub / m * calc_entropy(r_sub_data_temp)
                logging.debug('%s: >%s, [%d/%d], delta_c_entropy: %.6f, sub_data:\n%s' % (feature, pt, m_r_sub, m, delta_c_entropy, r_sub_data_temp.data))
                c_entropy += delta_c_entropy

                info_gain_temp = base_entropy - c_entropy
                if info_gain_temp >= info_gain:
                    split_point = pt
                    info_gain = info_gain_temp
                    l_p = m_l_sub / m
                    r_p = m_r_sub / m
                
            # calculate split info
            for p in [l_p, r_p]:
                split_info += - p * math.log(p, 2)

        # if p is 0/1, make split_info non-zero
        if split_info == 0.0:
            split_info = - 0.99 * math.log(0.99, 2) - 0.01 * math.log(0.01, 2)

        # calculate info gain rate
        info_gain_rate_temp = (base_entropy - c_entropy) / split_info
        if info_gain_rate_temp > info_gain_rate:
            logging.debug('feature: %s, split_point: %s, info_gain_rate_temp: %.6f' % (feature, split_point, info_gain_rate_temp))
            info_gain_rate = info_gain_rate_temp
            best_feature = feature
            best_feature_val = split_point
    
    logging.info('best feature: %s, info gain rate: %.6f' % (best_feature, info_gain_rate))
    
    return best_feature, best_feature_val

In [46]:
def split_data(dataset, feature, op, value):
    data = dataset.data
    sub_data_temp = None
    if dataset.column_property_val('is_discrete', feature) == True:
        sub_data_temp = data[data[feature] == value].copy()
    else:
        if op == '<=':
            sub_data_temp = data[data[feature] <= value].copy()
        elif op == '>':
            sub_data_temp = data[data[feature] > value].copy()
        else:
            pass
    
    sub_data_temp.drop(feature, axis=1, inplace=True)
    return Dataset(sub_data_temp, dataset.column_properties)

In [47]:
def major_label(labels):
    value_counts = labels.value_counts().sort_values(ascending=False)
    logging.info('label conflict exists:\n%s' % (value_counts))
    return value_counts.index[0]

In [48]:
def create_tree(dataset):
    data = dataset.data
    labels = data[data.columns[-1]]
    unique_labels = labels.unique()
    
    # if only one unique label left, just pick this label
    if len(unique_labels) == 1:
        return Node(**{'label': unique_labels[0], 
                       'is_leaf': True})
    
    # if some conflicts on labels with same input, pick the label with highest probability
    if len(data.columns) == 1:
        return Node(**{'label': major_label(labels), 
                       'is_leaf': True})
    
    best_feature, best_feature_val = choose_best_feature(dataset)
    node = Node(**{'feature': best_feature, 
                   'is_discrete': dataset.column_property_val('is_discrete', best_feature), 
                   'is_leaf': False})
                
    if dataset.column_property_val('is_discrete', best_feature) == True:
        unique_feature_vals = data[best_feature].unique()
        for val in unique_feature_vals:
            sub_data = split_data(dataset, best_feature, None, val)
            child = create_tree(sub_data)
            node.add_child('==', val, child)
    else:
        l_sub_data = split_data(dataset, best_feature, '<=', best_feature_val)
        l_child = create_tree(l_sub_data)
        node.add_child('<=', best_feature_val, l_child)
        
        r_sub_data = split_data(dataset, best_feature, '>', best_feature_val)
        r_child = create_tree(r_sub_data)
        node.add_child('>', best_feature_val, r_child)
    
    return node

In [49]:
def fit(model, dataset):
    data = dataset.data
    labels = data[data.columns[-1]]
    m = len(data)
    
    pred_data = pd.DataFrame()
    for r_idx, _ in data.iterrows():
        node = model
        while True:
            if node.is_leaf:
                pred_data = pred_data.append(pd.DataFrame({'pred_labels': [node.label]}), ignore_index=True)
                break
            node = node.find_next_node(dataset, r_idx)
    
    results = pred_data[pred_data['pred_labels'] == labels]
    ratings = len(results) / m
    return ratings

In [51]:
if __name__ == '__main__':
    # logging setting
#     logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
    logging.basicConfig(level=logging.INFO, format='%(filename)s[%(lineno)d] - %(message)s')
    
    
    # model train
#     dataset = load_dataset()
    dataset = load_iris()
    base_entropy = calc_entropy(dataset)
    logging.info('base entropy: %s' % base_entropy)
    
    root = create_tree(dataset)
    root.print_node()
#     logging.info('model: %s' % root)
    
    # verify
    verify_data = dataset
    verify_ratings = fit(root, verify_data)
    logging.info('verify rating: %.2f%%' % (verify_ratings * 100))

    # test
#     test_data = load_test_dataset()
#     ratings = fit(root, test_data)
    
#     logging.debug('test_data:\n%s' % test_data.data)
#     logging.info('rating: %.2f%%' % (ratings * 100))

<ipython-input-51-503138a35b4a>[11] - base entropy: 1.584962500721156
<ipython-input-45-a5ea28f2c8dc>[91] - best feature: PetalWidthCm, info gain rate: 0.071696
<ipython-input-45-a5ea28f2c8dc>[91] - best feature: SepalWidthCm, info gain rate: 0.065394
<ipython-input-45-a5ea28f2c8dc>[91] - best feature: PetalLengthCm, info gain rate: 0.372607
<ipython-input-45-a5ea28f2c8dc>[91] - best feature: SepalLengthCm, info gain rate: 1.000000
<ipython-input-45-a5ea28f2c8dc>[91] - best feature: SepalLengthCm, info gain rate: 0.051887
<ipython-input-45-a5ea28f2c8dc>[91] - best feature: PetalLengthCm, info gain rate: 0.230552
<ipython-input-47-d5fddcdfd3b5>[3] - label conflict exists:
Iris-versicolor    25
Iris-virginica      1
Name: Species, dtype: int64
<ipython-input-47-d5fddcdfd3b5>[3] - label conflict exists:
Iris-virginica     9
Iris-versicolor    2
Name: Species, dtype: int64
<ipython-input-45-a5ea28f2c8dc>[91] - best feature: PetalLengthCm, info gain rate: 0.028631
<ipython-input-47-d5fddcdf