In [41]:
import pandas as pd
import math
import logging

In [42]:
# Named logger ('DT_C45') for this notebook. Note that the functions visible in this
# notebook emit their messages through the module-level `logging.*` helpers (i.e. the
# root logger) rather than through this object.
logger = logging.getLogger('DT_C45')

In [43]:
def load_data():
    """Build the 15-row categorical training table.

    Returns:
        pandas.DataFrame with columns, in this order:
        '年龄' (age), '有工作' (has job), '有房子' (owns house),
        '信贷情况' (credit standing) and '类别' (class label, last column).
    """
    age = ['青年'] * 5 + ['中年'] * 5 + ['老年'] * 5
    has_job = ['否', '否', '是', '是', '否',
               '否', '否', '是', '否', '否',
               '否', '否', '是', '是', '否']
    has_house = ['否', '否', '否', '是', '否',
                 '否', '否', '是', '是', '是',
                 '是', '是', '否', '否', '否']
    credit = ['一般', '好', '好', '一般', '一般',
              '一般', '好', '好', '非常好', '非常好',
              '非常好', '好', '好', '非常好', '一般']
    label = ['不通过', '不通过', '通过', '通过', '不通过',
             '不通过', '不通过', '通过', '通过', '通过',
             '通过', '通过', '通过', '通过', '不通过']

    # Insertion order fixes the column order; the label column must stay last.
    table = {}
    table['年龄'] = age
    table['有工作'] = has_job
    table['有房子'] = has_house
    table['信贷情况'] = credit
    table['类别'] = label
    return pd.DataFrame(table)

In [44]:
def load_test_data():
    """Build a 2-row held-out table with the same columns as ``load_data()``.

    Returns:
        pandas.DataFrame with columns '年龄', '有工作', '有房子', '信贷情况'
        and '类别' (label, last column).
    """
    data = pd.DataFrame({'年龄': ['青年', '中年'],
                         '有工作': ['是', '是'],
                         # Use the same '是'/'否' vocabulary as the training table.
                         # The previous value '有' never appears in load_data(), so a
                         # tree trained on it has no branch these rows could follow.
                         '有房子': ['是', '是'],
                         '信贷情况': ['一般', '好'],
                         '类别': ['通过', '通过']})
    return data

In [45]:
def calc_entropy(data):
    """Return the base-2 Shannon entropy of the last column of ``data``.

    Args:
        data: pandas.DataFrame whose last column holds the class labels.

    Returns:
        H = -sum(p_k * log2(p_k)) over the distinct label values, where p_k
        is the share of rows carrying label k. An empty frame yields 0.
    """
    target = data[data.columns[-1]]
    n_rows = len(data)

    total = 0
    # Iterating the value_counts() Series yields the per-label row counts.
    for count in target.value_counts():
        prob = count / n_rows
        total += -prob * math.log(prob, 2)

    return total

In [52]:
def choose_best_feature(data):
    """Pick the feature with the highest information gain ratio (C4.5 criterion).

    The last column of ``data`` is treated as the label Y; every other column
    is a candidate feature X. For each candidate:

        gain_ratio(X) = (H(Y) - H(Y|X)) / SplitInfo(X)

    where H(Y) comes from ``calc_entropy(data)``, H(Y|X) is the size-weighted
    entropy of the partitions returned by ``split_data``, and SplitInfo(X) is
    the entropy of X's own value distribution.

    Args:
        data: pandas.DataFrame, label column last.

    Returns:
        The name of the best feature column, or ``None`` when no candidate has
        a strictly positive gain ratio (this includes frames with no feature
        columns and frames where every feature is constant).
    """
    columns = data.columns[:-1]
    m = len(data)
    # Lazy %-formatting: the frame is only rendered if DEBUG is actually enabled.
    logging.debug('data:\n%s', data)

    base_entropy = calc_entropy(data)
    info_gain_rate = 0.
    best_feature = None
    for feature in columns:
        # C4.5 split information: entropy of the feature's own value distribution.
        split_info = 0.
        for _, cnt in data[feature].value_counts().items():
            p = cnt / m
            split_info += - p * math.log(p, 2)

        # A feature with a single value in this subset cannot partition the data:
        # its split information is 0 and the gain ratio would divide by zero.
        if split_info <= 0:
            logging.debug('feature: %s, split_info is 0, skipped', feature)
            continue

        unique_feature_vals = data[feature].unique()
        logging.debug('feature: %s, unique_val: %s', feature, unique_feature_vals)

        # Conditional entropy H(Y|X): partition entropies weighted by partition size.
        c_entropy = 0.
        for unique_feature_val in unique_feature_vals:
            sub_data_temp = split_data(data, feature, unique_feature_val)
            m_sub = len(sub_data_temp)
            delta_c_entropy = m_sub / m * calc_entropy(sub_data_temp)
            logging.debug('%s: %s, [%d/%d], delta_c_entropy: %.6f, sub_data:\n%s',
                          feature, unique_feature_val, m_sub, m, delta_c_entropy, sub_data_temp)
            c_entropy += delta_c_entropy

        logging.debug('feature: %s, total_c_entropy: %.6f', feature, c_entropy)

        # Information gain I(Y;X) = H(Y) - H(Y|X), normalised by the split information.
        info_gain_rate_temp = (base_entropy - c_entropy) / split_info

        # Strict '>' keeps the earliest column on ties.
        if info_gain_rate_temp > info_gain_rate:
            logging.debug('feature: %s, info_gain_rate_temp: %.6f', feature, info_gain_rate_temp)
            info_gain_rate = info_gain_rate_temp
            best_feature = feature

    logging.info('best feature: %s, info gain rate: %.6f', best_feature, info_gain_rate)

    return best_feature

In [47]:
def split_data(data, feature, value):
    """Return the rows where ``data[feature] == value``, without the ``feature`` column.

    The input frame is not modified; the result keeps the original row index
    labels of the selected rows.
    """
    row_mask = data[feature] == value
    return data[row_mask].drop(columns=feature)

In [48]:
def major_label(labels):
    """Return the most frequent value in ``labels`` (a pandas Series).

    The per-label counts are logged at INFO level before the winner is returned.
    """
    counts = labels.value_counts()
    ranked = counts.sort_values(ascending=False)
    logging.info('label conflict exists:\n%s' % (ranked))
    winner = ranked.index[0]
    return winner

In [49]:
def create_tree(data):
    """Recursively build a decision tree from ``data`` (label column last).

    Args:
        data: pandas.DataFrame; every column but the last is a feature.

    Returns:
        Either a leaf (a single label value) or a nested dict of the form
        ``{feature: {feature_value: subtree_or_leaf, ...}}``.
    """
    labels = data[data.columns[-1]]
    unique_labels = labels.unique()

    # Pure node: only one label left, so it becomes the leaf.
    if len(unique_labels) == 1:
        return unique_labels[0]

    # No features left but labels still disagree: take the majority label.
    if len(data.columns) == 1:
        return major_label(labels)

    best_feature = choose_best_feature(data)
    # choose_best_feature returns None when no remaining feature has a positive
    # gain ratio. Indexing data[None] would raise KeyError, so stop splitting and
    # emit a majority-label leaf instead.
    if best_feature is None:
        return major_label(labels)

    # One branch per value the chosen feature takes in this subset.
    branches = {}
    for val in data[best_feature].unique():
        branches[val] = create_tree(split_data(data, best_feature, val))

    return {best_feature: branches}

In [50]:
def fit(model, data):
    """Score a trained tree on ``data`` and return its accuracy.

    Despite the name, this function does not train anything: it walks
    ``model`` for every row of ``data`` and compares the predicted label with
    the row's true label (the last column).

    Args:
        model: a leaf label, or a nested dict ``{feature: {value: subtree}}``.
        data: pandas.DataFrame containing every feature the tree tests, with
            the label in the last column.

    Returns:
        float in [0, 1]: the fraction of rows predicted correctly. A row whose
        feature value has no matching branch counts as a wrong prediction.
        An empty ``data`` yields 0.0.
    """
    m = len(data)
    if m == 0:
        return 0.0

    # Take the labels positionally so the comparison does not depend on the
    # frame's index (e.g. a filtered frame whose index is not 0..n-1).
    labels = data[data.columns[-1]].tolist()

    correct = 0
    for (_, row), expected in zip(data.iterrows(), labels):
        node = model
        reached_leaf = True
        while isinstance(node, dict):
            if not node:
                # Malformed empty node: nothing to test, so there is no prediction.
                reached_leaf = False
                break

            # Each internal node has exactly one key: the feature it tests.
            feature = next(iter(node))
            branches = node[feature]
            value = row[feature]

            if value in branches:
                node = branches[value]
            elif str(value) in branches:
                # Backward compatibility: branch keys stored as strings.
                node = branches[str(value)]
            else:
                # Value never seen during training. Previously this spun
                # forever in `while True`; now the row is left unpredicted.
                reached_leaf = False
                break

        if reached_leaf and node == expected:
            correct += 1

    ratings = correct / m
    return ratings

In [53]:
if __name__ == '__main__':
    # Configure root logging. INFO prints the feature chosen at each split; swap in
    # the commented DEBUG configuration below for the per-feature entropy trace.
#     logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
    logging.basicConfig(level=logging.INFO, format='%(filename)s[%(lineno)d] - %(message)s')
    # Train: load the sample table and report the entropy of its label column.
    data = load_data()
    base_entropy = calc_entropy(data)
    logging.info('base entropy: %s' % base_entropy)
    
    # Build the decision tree (nested dicts) and log it.
    model = create_tree(data)
    logging.info('model: %s' % model)
    
    # Evaluation (currently disabled): score the tree on the rows from load_test_data().
#     test_data = load_test_data()
#     ratings = fit(model, test_data)
#     print('------------')
#     print(ratings)
#     logging.debug('test_data:\n%s' % test_data)
#     logging.info('rating: %.2f%%' % (ratings * 100))

<ipython-input-53-a24b5b9c725b>[8] - base entropy: 0.9709505944546686
<ipython-input-52-86646c0847f7>[37] - best feature: 有房子, info gain rate: 0.432538
<ipython-input-52-86646c0847f7>[37] - best feature: 有工作, info gain rate: 1.000000
<ipython-input-53-a24b5b9c725b>[11] - model: {'有房子': {'否': {'有工作': {'否': '不通过', '是': '通过'}}, '是': '通过'}}
