In [1]:
import pandas as pd
import math
import logging

In [2]:
# Named logger shared by the decision-tree classes in this notebook.
logger = logging.getLogger(name='DT_C45')

In [3]:
class Tree(object):
    """Thin wrapper that holds the root node of a decision tree.

    Attributes:
        root: the top-most node of the tree, or None for an empty tree.
    """
    # Class-level default; every instance overwrites it in __init__.
    root = None

    def __init__(self, root):
        """Store ``root`` as the tree's root node (may be None)."""
        self.root = root

    def print_tree(self):
        """Print the tree by delegating to the root node's ``print_node``.

        Does nothing when the tree has no root.
        """
        if self.root is None:
            return
        # Must go through ``self``: a bare ``root`` would resolve to whatever
        # global of that name happens to exist (or raise NameError).
        self.root.print_node()

In [4]:
class Node(object):
    """A single node of the decision tree.

    An internal node stores the feature it splits on and a mapping from a
    branch key (operator string followed by the feature value, e.g. ``'==yes'``)
    to the child node for that branch. A leaf node stores the predicted label.

    Keyword Args:
        feature: name of the column this node splits on (internal nodes).
        is_discreted: whether ``feature`` is a discrete (categorical) column.
        is_leaf: True when this node is a leaf.
        label: predicted label (leaf nodes).

    Any keyword that is omitted defaults to None.
    """
    # Class-level defaults; __init__ sets instance values for all of them.
    _feature = None
    _label = None
    _is_discreted = None
    _is_leaf = None
    _child_nodes = None

    def __init__(self, **kwargs):
        self._feature = kwargs.get('feature')
        self._is_discreted = kwargs.get('is_discreted')
        self._is_leaf = kwargs.get('is_leaf')
        self._label = kwargs.get('label')
        # Created per instance so nodes never share one children mapping.
        self._child_nodes = {}

    def __str__(self):
        return ('[%s, %s, %s, %s, %s]' % (self._feature, self._is_discreted, self._label, self._is_leaf, self._child_nodes.keys()))

    @property
    def label(self):
        """Predicted label stored on this node."""
        return self._label

    @label.setter
    def label(self, value):
        self._label = value

    @property
    def feature(self):
        """Name of the column this node splits on."""
        return self._feature

    @feature.setter
    def feature(self, value):
        self._feature = value

    @property
    def is_leaf(self):
        """True when this node is a leaf."""
        return self._is_leaf

    @is_leaf.setter
    def is_leaf(self, value):
        self._is_leaf = value

    @property
    def is_discreted(self):
        """Whether the split feature is discrete."""
        return self._is_discreted

    @is_discreted.setter
    def is_discreted(self, value):
        self._is_discreted = value

    @property
    def child_nodes(self):
        """Mapping of branch key -> child node."""
        return self._child_nodes

    def add_child(self, op, op_value, child_node):
        """Register ``child_node`` under the branch key ``op`` + ``op_value``.

        ``op_value`` is converted with ``str`` so that non-string feature
        values (ints, numpy scalars, ...) do not raise TypeError when the key
        is built.
        """
        self._child_nodes[str(op) + str(op_value)] = child_node

    def _check_arg(self, key, kwargs):
        """Return ``kwargs[key]`` if present, otherwise None."""
        return kwargs.get(key)

    def print_node(self):
        """Recursively log this node and its children at DEBUG level."""
        if self._is_leaf:
            logger.debug('Leaf node - Label [%s]' % (self._label))
        else:
            for key, child in self._child_nodes.items():
                logger.debug('Node [%s] Discreted [%s], [%s] -->' % (self._feature, self._is_discreted, key))
                child.print_node()

    def find_next_node(self, dataset, r_idx):
        """Return the child to follow for row ``r_idx`` of ``dataset``.

        Args:
            dataset: object whose ``data`` attribute is indexable as
                ``data[feature][r_idx]``.
            r_idx: index label of the row being classified.

        Returns:
            The matching child node, or None when this node is a leaf or when
            the split feature is not discrete (not implemented yet).

        Raises:
            KeyError: if the row's value has no registered branch.
        """
        if self._is_leaf:
            return None

        val = dataset.data[self._feature][r_idx]
        if self._is_discreted:
            # Same key construction as add_child, including str() coercion.
            return self._child_nodes['==' + str(val)]

        # TODO non discreted: continuous splits are not implemented; callers
        # must handle the None result.
        return None

In [5]:
class Dataset(object):
    """Bundle a data table with per-column metadata.

    ``data`` is the table itself and ``column_properties`` is a two-level
    mapping: property name -> column name -> value.
    """
    # Class-level defaults, replaced on every instance by __init__.
    _data = None
    _column_properties = None

    def __init__(self, data, column_properties):
        # Route through the property setters defined below.
        self.data = data
        self.column_properties = column_properties

    def __len__(self):
        """Length of the underlying data object."""
        return len(self.data)

    @property
    def data(self):
        """The wrapped data table."""
        return self._data

    @data.setter
    def data(self, value):
        self._data = value

    @property
    def column_properties(self):
        """The full property-name -> column -> value mapping."""
        return self._column_properties

    @column_properties.setter
    def column_properties(self, value):
        self._column_properties = value

    def column_property(self, key):
        """Return the per-column mapping stored under property ``key``."""
        return self.column_properties[key]

    def column_property_val(self, key, feature):
        """Return property ``key`` for the single column ``feature``."""
        per_column = self.column_property(key)
        return per_column[feature]

In [6]:
def load_dataset():
    """Build the 15-row training set used by this notebook.

    Columns, in order: '年龄' (numeric age), '年龄分类' (age group),
    '有工作' (has a job), '有房子' (owns a house), '信贷情况' (credit
    standing) and, last, the label column '类别'.

    Returns:
        Dataset wrapping the DataFrame plus an ``is_discreted`` property map.
    """
    # Key order matters: the last column is used as the label downstream.
    records = {
        '年龄': [24, 26, 23, 27, 34, 36, 36, 38, 40, 52, 67, 74, 67, 66, 65],
        '年龄分类': ['青年', '青年', '青年', '青年', '青年', '中年', '中年', '中年', '中年', '中年', '老年', '老年', '老年', '老年', '老年'],
        '有工作': ['否', '否', '是', '是', '否', '否', '否', '是', '否', '否', '否', '否', '是', '是', '否'],
        '有房子': ['否', '否', '否', '是', '否', '否', '否', '是', '是', '是', '是', '是', '否', '否', '否'],
        '信贷情况': ['一般', '好', '好', '一般', '一般', '一般', '好', '好', '非常好', '非常好', '非常好', '好', '好', '非常好', '一般'],
        '类别': ['不通过', '不通过', '通过', '通过', '不通过', '不通过', '不通过', '通过', '通过', '通过', '通过', '通过', '通过', '通过', '不通过'],
    }
    # Note: '年龄' has no entry here (its `False` entry is commented out).
    discreted_flags = {
        # '年龄': False,
        '年龄分类': True,
        '有工作': True,
        '有房子': True,
        '信贷情况': True,
        '类别': True,
    }

    frame = pd.DataFrame(records)
    return Dataset(frame, {'is_discreted': discreted_flags})

In [7]:
def load_test_dataset():
    """Build the 2-row hold-out set with the same columns as the training set.

    Unlike the training loader, the ``is_discreted`` map here does include
    '年龄' (flagged as not discrete).

    Returns:
        Dataset wrapping the DataFrame plus an ``is_discreted`` property map.
    """
    # Key order matters: the last column is used as the label downstream.
    records = {
        '年龄': [25, 37],
        '年龄分类': ['青年', '中年'],
        '有工作': ['是', '是'],
        '有房子': ['是', '是'],
        '信贷情况': ['一般', '好'],
        '类别': ['通过', '通过'],
    }
    discreted_flags = {
        '年龄': False,
        '年龄分类': True,
        '有工作': True,
        '有房子': True,
        '信贷情况': True,
        '类别': True,
    }

    frame = pd.DataFrame(records)
    return Dataset(frame, {'is_discreted': discreted_flags})

In [8]:
def calc_entropy(dataset):
    """Return the base-2 Shannon entropy of the dataset's label column.

    The label column is the last column of ``dataset.data``. Each distinct
    label contributes ``-p * log2(p)`` where ``p`` is its count divided by
    the number of rows.
    """
    frame = dataset.data
    total_rows = len(frame)
    label_counts = frame[frame.columns[-1]].value_counts()

    result = 0.
    for count in label_counts:
        prob = count / total_rows
        result -= prob * math.log(prob, 2)

    return result

In [9]:
def choose_best_feature(dataset):
    """Pick the feature with the highest C4.5 information gain ratio.

    The last column of ``dataset.data`` is the label; every other column is a
    candidate. For a discrete feature X the score is::

        gain_ratio(X) = (H(Y) - H(Y|X)) / SplitInfo(X)

    where SplitInfo(X) is the entropy of X's own value distribution.

    Candidates are skipped (never selected) when:
      * the column is not flagged as discrete in the ``is_discreted`` column
        property, or has no entry there at all -- continuous splits are not
        implemented yet;
      * the column has a single distinct value, so SplitInfo is 0 and the
        ratio is undefined.

    Args:
        dataset: object exposing ``data`` (a DataFrame),
            ``column_property('is_discreted')`` (a dict) and accepted by
            ``calc_entropy`` / ``split_data``.

    Returns:
        ``(best_feature, best_feature_val)``. ``best_feature`` is a column
        name, or None when no candidate has a strictly positive gain ratio.
        ``best_feature_val`` is always None for now (placeholder for the
        threshold of a continuous split).
    """
    data = dataset.data
    columns = data.columns[:-1]
    m = len(data)
    logging.debug('data:\n%s' % data)

    base_entropy = calc_entropy(dataset)
    info_gain_rate = 0.
    best_feature = None
    best_feature_val = None
    discreted_flags = dataset.column_property('is_discreted')

    for feature in columns:
        if not discreted_flags.get(feature, False):
            # TODO non-discreted: evaluate thresholds at the midpoints of
            # consecutive sorted unique values, i.e.
            # (vals[:-1] + vals[1:]) / 2, and keep the best one in
            # best_feature_val. Until then the feature cannot be scored, so it
            # must not reuse the previous feature's score.
            logging.debug('feature: %s skipped, non-discreted split is not implemented' % feature)
            continue

        # C4.5 split info: entropy of the feature's own value distribution.
        split_info = 0.
        for _, cnt in data[feature].value_counts().items():
            p = cnt / m
            split_info += - p * math.log(p, 2)

        if split_info <= 0.:
            # Single-valued feature: splitting on it separates nothing and
            # would divide by zero below.
            logging.debug('feature: %s skipped, split info is 0' % feature)
            continue

        unique_feature_vals = data[feature].unique()
        logging.debug('feature: %s, unique_val: %s' % (feature, unique_feature_vals))

        # Conditional entropy H(Y|X): size-weighted entropy of each subset.
        c_entropy = 0.
        for unique_feature_val in unique_feature_vals:
            sub_data_temp = split_data(dataset, feature, unique_feature_val)
            m_sub = len(sub_data_temp)
            delta_c_entropy = m_sub / m * calc_entropy(sub_data_temp)
            logging.debug('%s: %s, [%d/%d], delta_c_entropy: %.6f, sub_data:\n%s' % (feature, unique_feature_val, m_sub, m, delta_c_entropy, sub_data_temp.data))
            c_entropy += delta_c_entropy

        logging.debug('feature: %s, total_c_entropy: %.6f' % (feature, c_entropy))

        # I(Y|X) = H(Y) - H(Y|X); H(Y) is fixed for a given data set.
        info_gain_rate_temp = (base_entropy - c_entropy) / split_info

        if info_gain_rate_temp > info_gain_rate:
            logging.debug('feature: %s, info_gain_rate_temp: %.6f' % (feature, info_gain_rate_temp))
            info_gain_rate = info_gain_rate_temp
            best_feature = feature

    logging.info('best feature: %s, info gain rate: %.6f' % (best_feature, info_gain_rate))

    return best_feature, best_feature_val

In [10]:
def split_data(dataset, feature, value):
    """Return the subset of ``dataset`` where ``feature == value``.

    The ``feature`` column itself is removed from the result. The returned
    Dataset wraps a new DataFrame but shares the original's
    ``column_properties`` object.
    """
    source = dataset.data
    row_mask = source[feature] == value
    # drop() without inplace returns a new frame, so `source` stays untouched.
    subset = source[row_mask].drop(feature, axis=1)
    return Dataset(subset, dataset.column_properties)

In [11]:
def major_label(labels):
    """Return the most frequent value in ``labels`` (a pandas Series).

    The full frequency table is logged at INFO level before returning.
    """
    frequency = labels.value_counts()
    ranked = frequency.sort_values(ascending=False)
    logging.info('label conflict exists:\n%s' % (ranked,))
    top_label = ranked.index[0]
    return top_label

In [12]:
def create_tree(dataset):
    """Recursively build a decision tree for ``dataset`` and return its root.

    The last column of ``dataset.data`` is the label. Recursion stops with a
    leaf node when:
      * every row has the same label;
      * only the label column is left (rows with identical inputs disagree),
        in which case the majority label is used;
      * no remaining feature is usable, i.e. ``choose_best_feature`` returns
        None as the best feature; the majority label is used here as well.

    Otherwise the node splits on the best feature and one child is built per
    distinct value of that feature.
    """
    data = dataset.data
    labels = data[data.columns[-1]]
    unique_labels = labels.unique()

    # if only one unique label left, just pick this label
    if len(unique_labels) == 1:
        return Node(**{'label': unique_labels[0],
                       'is_leaf': True})

    # if some conflicts on labels with same input, pick the label with highest probability
    if len(data.columns) == 1:
        return Node(**{'label': major_label(labels),
                       'is_leaf': True})

    best_feature, best_feature_val = choose_best_feature(dataset)

    # No feature can be split on: looking up the column property of None
    # would fail, so fall back to the majority label instead.
    if best_feature is None:
        return Node(**{'label': major_label(labels),
                       'is_leaf': True})

    is_discreted = dataset.column_property_val('is_discreted', best_feature)
    node = Node(**{'feature': best_feature,
                   'is_discreted': is_discreted,
                   'is_leaf': False})

    # Only equality branches exist for now; anything else is a placeholder.
    if is_discreted == True:
        op = '=='
    else:
        op = '[UNKNOWN]'

    for val in data[best_feature].unique():
        child = create_tree(split_data(dataset, best_feature, val))
        node.add_child(op, val, child)

    return node

In [13]:
def fit(model, dataset):
    """Score a trained tree on ``dataset`` and return its accuracy.

    Despite the name, nothing is trained here: every row is routed from the
    root ``model`` down to a leaf and the leaf's label is compared with the
    row's true label (the last column of ``dataset.data``).

    Args:
        model: root node exposing ``is_leaf``, ``label`` and
            ``find_next_node(dataset, r_idx)``.
        dataset: object whose ``data`` attribute is a DataFrame.

    Returns:
        Fraction of rows classified correctly, in [0, 1]. Returns 0.0 for an
        empty dataset. A row for which ``find_next_node`` returns None before
        a leaf is reached (e.g. an unimplemented split) counts as incorrect.
    """
    data = dataset.data
    labels = data[data.columns[-1]]
    m = len(data)
    if m == 0:
        return 0.

    # Predictions are compared row by row against the true label, so the
    # result does not depend on the DataFrame's index being 0..m-1.
    n_correct = 0
    for r_idx, true_label in labels.items():
        node = model
        while node is not None and not node.is_leaf:
            node = node.find_next_node(dataset, r_idx)
        if node is not None and node.label == true_label:
            n_correct += 1

    return n_correct / m

In [14]:
# Driver cell: configure logging, train a tree on the built-in training set and
# print it. `dataset`, `base_entropy` and `root` become notebook-level globals.
if __name__ == '__main__':
    # logging setting
    # (alternative below: DEBUG level with timestamps and level names)
#     logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
    logging.basicConfig(level=logging.INFO, format='%(filename)s[%(lineno)d] - %(message)s')
    
    
    # model train
    dataset = load_dataset()
    base_entropy = calc_entropy(dataset)
    logging.info('base entropy: %s' % base_entropy)
    
    root = create_tree(dataset)
    # print_node() logs at DEBUG level, so nothing is shown while the level
    # configured above is INFO.
    root.print_node()
#     logging.info('model: %s' % root)
    
    # test
    # (currently disabled: score the trained tree on the hold-out set)
#     test_data = load_test_dataset()
#     ratings = fit(root, test_data)
    
#     logging.debug('test_data:\n%s' % test_data.data)
#     logging.info('rating: %.2f%%' % (ratings * 100))

<ipython-input-14-87df49c82c05>[10] - base entropy: 0.9709505944546686
<ipython-input-9-5842f9ea170f>[55] - best feature: 有房子, info gain rate: 0.432538
<ipython-input-9-5842f9ea170f>[55] - best feature: 有工作, info gain rate: 1.000000
<ipython-input-14-87df49c82c05>[21] - rating: 100.00%
