In [1]:
import numpy as np
import pandas as pd
import math
import logging
import os
import uuid
import graphviz as gv
import itertools

In [2]:
# Named logger shared by the classes defined in this file.
logger = logging.getLogger('RANDOM_FOREST_CART')
# Names of the per-column properties a Dataset carries.  Despite the name this
# is a list; entry 0 ('is_discrete') is the key used to look up whether a
# feature column is discrete or continuous.
column_property_dict = ['is_discrete']

In [3]:
class Base(object):
    """Common ancestor that gives each object a unique id and an optional name."""

    _uuid = None
    _name = None

    def __init__(self, **kwargs):
        """Create the object.

        A fresh random UUID is always assigned; the ``name`` keyword, when
        supplied, becomes the object's name (otherwise the name is ``None``).
        """
        self._uuid = uuid.uuid4()
        self._name = self._check_arg('name', **kwargs)

    def _check_arg(self, key, default=None, **kwargs):
        """Return ``kwargs[key]`` when that keyword was passed, else ``default``."""
        return kwargs.get(key, default)

    @property
    def uuid(self):
        """Read-only UUID generated at construction time."""
        return self._uuid

    @property
    def name(self):
        """Read-only name given at construction time (may be ``None``)."""
        return self._name

In [4]:
class RandomForest(Base):
    """Bagging ensemble of ``Tree`` classifiers.

    Each tree is grown on a random sample of the training dataset drawn with
    ``dataset.sample(frac=...)``.
    """

    _dataset = None
    _k = None
    _frac = None
    # The lists are created per instance in __init__.  Class-level list
    # literals would be shared (and keep growing) across every RandomForest.
    _samples = None
    _forest = None

    def __init__(self, dataset, frac=0.7, k=3, **kwargs):
        """Store the training data and the ensemble settings.

        Args:
            dataset: training ``Dataset``.
            frac: fraction of rows drawn for each tree's sample.
            k: number of trees grown by one ``generate_forest`` call.
            **kwargs: forwarded to ``Base`` (e.g. ``name``).
        """
        super(RandomForest, self).__init__(**kwargs)
        self._dataset = dataset
        self._k = k
        self._frac = frac
        self._samples = []
        self._forest = []

    def generate_forest(self):
        """Grow ``k`` trees, each on its own sample of the training dataset.

        The samples and trees are appended to the forest, so calling this
        method again adds another ``k`` trees.  Every tree is logged via
        ``print_tree`` as soon as it is built.
        """
        for _ in range(self._k):
            # Sample from the dataset given to the constructor, not from
            # whatever module-level ``dataset`` happens to exist.
            sample = self._dataset.sample(frac=self._frac)
            self._samples.append(sample)

            dt = Tree()
            dt.generate_tree(sample)
            dt.print_tree()
            self._forest.append(dt)

    def predict(self, dataset):
        """Predict a label for every row of ``dataset`` by majority vote.

        Each tree's individual prediction frame is still printed, as before.

        Returns:
            A ``pandas.DataFrame`` with a single ``pred_labels`` column holding
            the most frequent label among the trees for each row.  Ties go to
            the label predicted by the earliest tree.  With an empty forest
            the frame has no rows.
        """
        from collections import Counter

        per_tree = []
        for tree in self._forest:
            tree_pred = tree.predict(dataset)
            print(tree_pred)
            # ``get`` tolerates a prediction frame that has no column at all.
            per_tree.append(list(tree_pred.get('pred_labels', [])))

        majority = [Counter(row_votes).most_common(1)[0][0]
                    for row_votes in zip(*per_tree)]
        return pd.DataFrame({'pred_labels': majority})

    def rating(self, dataset=None):
        """Return the mean of the per-tree accuracies.

        Args:
            dataset: ``Dataset`` to score against; defaults to the training
                dataset given to the constructor.

        Returns:
            The average of ``tree.rating(...)`` over the trees actually in the
            forest, or ``0.0`` when the forest is empty.
        """
        if not self._forest:
            return 0.
        target = self._dataset if dataset is None else dataset
        scores = [tree.rating(target) for tree in self._forest]
        # Divide by the real forest size: it differs from ``k`` whenever
        # generate_forest was called zero or several times.
        return math.fsum(scores) / len(scores)

In [5]:
class Tree(Base):
    """Binary CART classification tree grown by minimising Gini impurity.

    In every dataset the last column is the class label and all preceding
    columns are features.  A feature is discrete or continuous according to
    the dataset's ``'is_discrete'`` column property.
    """

    _root = None

    def __init__(self, **kwargs):
        super(Tree, self).__init__(**kwargs)

    def generate_tree(self, dataset):
        """Grow the tree from ``dataset`` and store its root node.

        Raises:
            ValueError: if ``dataset`` has no rows.
        """
        if len(dataset.data) == 0:
            raise ValueError('cannot build a tree from an empty dataset')
        self._root = self._build_node(dataset)

    @staticmethod
    def _is_discrete(dataset, feature):
        """Tell whether ``feature`` is flagged as discrete in ``dataset``."""
        # ``== True`` rather than truthiness keeps the original semantics for
        # non-boolean property values (e.g. NaN compares unequal to True).
        return dataset.column_property_val(column_property_dict[0], feature) == True

    @staticmethod
    def _gini_impurity(labels):
        """Return ``1 - sum(p_i ** 2)`` over the label frequencies of ``labels``.

        An empty series yields ``1.0``; callers weight it by a zero row count.
        """
        n = len(labels)
        impurity = 1.
        for cnt in labels.value_counts():
            impurity -= math.pow(cnt / n, 2)
        return impurity

    def _build_node(self, dataset):
        """Recursively build the subtree for ``dataset`` and return its root."""
        data = dataset.data
        labels = data[data.columns[-1]]
        unique_labels = labels.unique()

        # Pure node: only one label left, so emit it directly.
        if len(unique_labels) == 1:
            return Node(label=unique_labels[0], is_leaf=True)

        # No features left but labels still disagree: take the majority label.
        if len(data.columns) == 1:
            return Node(label=self._major_label(labels), is_leaf=True)

        best_feature, best_feature_val = self._choose_best_feature(dataset)
        if best_feature is None:
            # Every remaining feature is constant, so nothing can separate
            # the rows any further.
            return Node(label=self._major_label(labels), is_leaf=True)

        node = Node(feature=best_feature,
                    value=best_feature_val,
                    is_discrete=dataset.column_property_val(column_property_dict[0], best_feature),
                    is_leaf=False)

        l_sub_data, r_sub_data = self._split_data(dataset, best_feature, best_feature_val)
        node.left_node = self._build_node(l_sub_data)
        node.right_node = self._build_node(r_sub_data)

        return node

    def _choose_best_feature(self, dataset):
        """Find the split with the lowest weighted Gini impurity.

        Returns:
            ``(feature, value)`` where ``value`` is a pair of value tuples for
            a discrete feature or a numeric threshold for a continuous one.
            ``(None, None)`` when no feature offers at least two distinct
            values to split on.
        """
        data = dataset.data
        features = data.columns[:-1]
        label_col = data.columns[-1]
        labels = data[label_col]
        m = len(data)
        logger.debug('data:\n%s', data)

        gini = 1.
        best_feature = None
        best_feature_val = None

        for feature in features:
            if self._is_discrete(dataset, feature):
                # Discrete feature: try every two-way partition of its values.
                for group in self._feature_split_groups(dataset, feature):
                    gini_temp = self._calc_group_gini(dataset, feature, group)
                    if gini_temp < gini:
                        gini = gini_temp
                        best_feature = feature
                        best_feature_val = group
            else:
                # Continuous feature: candidate thresholds are the midpoints
                # between consecutive distinct values.
                column = data[feature]
                unique_vals = column.sort_values(ascending=True).unique()
                if len(unique_vals) < 2:
                    # A constant feature cannot separate anything; skip it
                    # instead of producing an empty child.
                    continue
                # Pair every value with its successor, including the last
                # pair (the two largest values).
                thresholds = (unique_vals[:-1] + unique_vals[1:]) / 2

                for pt in thresholds:
                    left = labels[column <= pt]
                    right = labels[column > pt]
                    gini_temp = (len(left) / m * self._gini_impurity(left)
                                 + len(right) / m * self._gini_impurity(right))
                    if gini_temp < gini:
                        gini = gini_temp
                        best_feature = feature
                        best_feature_val = pt

        logger.info('best feature: %s, gini: %.6f', best_feature, gini)

        return best_feature, best_feature_val

    def _split_data(self, dataset, feature, value):
        """Split ``dataset`` on ``feature`` into a (left, right) pair.

        Args:
            feature: column to split on.
            value: for a discrete feature a pair ``(left_values, right_values)``;
                for a continuous feature the threshold (left is ``<= value``).

        Returns:
            Two new ``Dataset`` objects.  The split column is removed from
            both, so it is not offered again further down that branch.
        """
        data = dataset.data
        if self._is_discrete(dataset, feature):
            l_mask = data[feature].isin(value[0])
            r_mask = data[feature].isin(value[1])
        else:
            l_mask = data[feature] <= value
            r_mask = data[feature] > value

        l_sub_data = data[l_mask].drop(feature, axis=1)
        r_sub_data = data[r_mask].drop(feature, axis=1)

        return (Dataset(l_sub_data, dataset.column_properties),
                Dataset(r_sub_data, dataset.column_properties))

    def _major_label(self, labels):
        """Return the most frequent label in ``labels`` (and log the counts)."""
        value_counts = labels.value_counts().sort_values(ascending=False)
        logger.info('label conflict exists:\n%s', value_counts)
        return value_counts.index[0]

    def _feature_split_groups(self, dataset, feature):
        """List every two-way partition of the distinct values of ``feature``.

        Returns:
            A list of ``(left_values, right_values)`` tuple pairs; empty when
            the feature has fewer than two distinct values.
        """
        uni_vals = dataset.data[feature].unique()
        subsets = []
        for size in range(1, len(uni_vals)):
            subsets.extend(itertools.combinations(uni_vals, size))
        half = len(subsets) // 2

        # Combinations come out in lexicographic order per size, so the subset
        # at the mirrored position from the end is the complement.
        return list(zip(subsets[:half], subsets[half:][::-1]))

    def _calc_group_gini(self, dataset, feature, group):
        """Return the weighted Gini impurity of splitting ``feature`` by ``group``.

        ``group`` is a sequence of value collections (two in practice); each
        part's impurity is weighted by its share of the rows.
        """
        data = dataset.data
        m = len(data)
        label_col = data.columns[-1]

        weighted = []
        for values in group:
            part = data[data[feature].isin(values)]
            weighted.append(len(part) / m * self._gini_impurity(part[label_col]))

        return math.fsum(weighted)

    def predict(self, dataset):
        """Predict a label for every row of ``dataset``.

        Returns:
            A ``pandas.DataFrame`` with one ``pred_labels`` column, in row
            order with a fresh 0..n-1 index.  A row that reaches a discrete
            node with a value unseen in training gets ``None``.

        Raises:
            RuntimeError: if the tree has not been generated yet.
        """
        if self._root is None:
            raise RuntimeError('generate_tree must be called before predict')

        pred_labels = []
        for r_idx in dataset.data.index:
            node = self._root
            # find_next_node returns None when it cannot route the row.
            while node is not None and not node.is_leaf:
                node = node.find_next_node(dataset, r_idx)
            pred_labels.append(None if node is None else node.label)
        # Build the frame once; DataFrame.append was removed in pandas 2.0
        # and copied the whole frame on every row.
        return pd.DataFrame({'pred_labels': pred_labels})

    def rating(self, dataset):
        """Return the fraction of rows of ``dataset`` predicted correctly.

        Returns ``0.0`` for a dataset without rows.
        """
        data = dataset.data
        m = len(data)
        if m == 0:
            return 0.
        labels = data[data.columns[-1]]

        pred_data = self.predict(dataset)
        # Compare by position: predictions carry a fresh 0..n-1 index while
        # ``labels`` keeps the dataset's own (possibly sampled) index.
        matches = pred_data['pred_labels'].values == labels.values
        return int(matches.sum()) / m

    def print_tree(self):
        """Log the tree, root first."""
        self._root.print_node()

    def gen_graph(self):
        """Render the tree with graphviz from the DOT edges of its nodes."""
        edge_data = ''.join(self._root.child_nodes_to_dot())
        dot_data = 'digraph edge_settings {%s}' % edge_data
        graph = gv.Source(dot_data)
        graph.render()

        logger.info('generate graph')

In [6]:
class Node(Base):
    """One node of the decision tree.

    A leaf carries a ``label``.  A decision node carries the ``feature`` it
    tests, the split ``value`` and two children: for a discrete feature
    ``value`` is a pair of value collections (left, right); otherwise it is a
    threshold with ``<=`` going left.
    """

    _feature = None
    _value = None
    _label = None
    _is_discrete = None
    _is_leaf = None
    _left_node = None
    _right_node = None

    def __init__(self, **kwargs):
        """Read the optional keywords feature, is_discrete, is_leaf, label, value."""
        super(Node, self).__init__(**kwargs)
        for field in ('feature', 'is_discrete', 'is_leaf', 'label', 'value'):
            setattr(self, '_' + field, self._check_arg(field, **kwargs))

    # ---- plain read/write attributes -----------------------------------

    @property
    def label(self):
        """Class label held by a leaf."""
        return self._label

    @label.setter
    def label(self, value):
        self._label = value

    @property
    def feature(self):
        """Name of the column a decision node tests."""
        return self._feature

    @feature.setter
    def feature(self, value):
        self._feature = value

    @property
    def value(self):
        """Split value: a threshold, or a pair of value collections."""
        return self._value

    @value.setter
    def value(self, value):
        self._value = value

    @property
    def is_leaf(self):
        """Whether this node is a leaf."""
        return self._is_leaf

    @is_leaf.setter
    def is_leaf(self, value):
        self._is_leaf = value

    @property
    def is_discrete(self):
        """Whether the tested feature is discrete."""
        return self._is_discrete

    @is_discrete.setter
    def is_discrete(self, value):
        self._is_discrete = value

    @property
    def left_node(self):
        """Child for the left branch."""
        return self._left_node

    @left_node.setter
    def left_node(self, value):
        self._left_node = value

    @property
    def right_node(self):
        """Child for the right branch."""
        return self._right_node

    @right_node.setter
    def right_node(self, value):
        self._right_node = value

    # ---- logging --------------------------------------------------------

    def _print_node(self):
        """Log a one-line description of this node only."""
        if not self._is_leaf:
            logger.info('Decision Node [%s], Value [%s]' % (self._feature, str(self._value)))
            return
        logger.info('Label Node [%s]' % (self._label))

    def print_node(self):
        """Log this node, then its left subtree, then its right subtree."""
        self._print_node()
        for child in (self._left_node, self._right_node):
            if child:
                child.print_node()

    # ---- DOT export -----------------------------------------------------

    def _node_to_dot(self):
        """Return the DOT edge statements from this node to its two children.

        A leaf has no outgoing edges and yields an empty list.
        """
        if self._is_leaf:
            return []

        if self._is_discrete == True:
            branches = (
                (self._left_node, '"%s\n[%s]" -> "%s\n[%s]"[label="in %s"];', self._value[0]),
                (self._right_node, '"%s\n[%s]" -> "%s\n[%s]"[label="in %s"];', self._value[1]),
            )
        else:
            threshold = str(self._value)
            branches = (
                (self._left_node, '"%s\n[%s]" -> "%s\n[%s]"[label="<= %s"];', threshold),
                (self._right_node, '"%s\n[%s]" -> "%s\n[%s]"[label="> %s"];', threshold),
            )

        edges = []
        for child, template, condition in branches:
            # A leaf child is titled by its label, a decision child by its feature.
            title = child.label if child.is_leaf else child.feature
            edges.append(template % (self._feature, self._uuid, title, child.uuid, condition))
        return edges

    def child_nodes_to_dot(self):
        """Collect the DOT edges of the whole subtree rooted at this node."""
        collected = list(self._node_to_dot())
        for child in (self._left_node, self._right_node):
            if child:
                collected.extend(child.child_nodes_to_dot())
        return collected

    # ---- traversal ------------------------------------------------------

    def find_next_node(self, dataset, r_idx):
        """Return the child that row ``r_idx`` of ``dataset`` should visit next.

        Returns ``None`` for a leaf, and also when a discrete value belongs to
        neither side of the split.
        """
        if self._is_leaf == True:
            return None

        val = dataset.data[self._feature][r_idx]
        if self._is_discrete == True:
            if val in self._value[0]:
                return self._left_node
            if val in self._value[1]:
                return self._right_node
            return None

        # Continuous feature: threshold comparison.
        return self._left_node if val <= self._value else self._right_node
In [7]:
class Dataset(object):
    """A data table bundled with its per-column properties.

    ``data`` is the table itself; ``column_properties`` is a nested mapping
    ``{property_name: {column_name: value}}``.
    """

    _data = None
    _column_properties = None

    def __init__(self, data, column_properties):
        """Keep references to ``data`` and ``column_properties`` (no copies)."""
        self._column_properties = column_properties
        self._data = data

    def __len__(self):
        """Number of rows, as reported by ``len(data)``."""
        return len(self._data)

    @property
    def data(self):
        """The wrapped table."""
        return self._data

    @data.setter
    def data(self, value):
        self._data = value

    @property
    def column_properties(self):
        """The full property mapping."""
        return self._column_properties

    @column_properties.setter
    def column_properties(self, value):
        self._column_properties = value

    def column_property(self, key):
        """Return the ``{column: value}`` mapping stored under property ``key``."""
        properties = self._column_properties
        return properties[key]

    def column_property_val(self, key, feature):
        """Return the value of property ``key`` for column ``feature``."""
        per_column = self._column_properties[key]
        return per_column[feature]

    def sample(self, frac=0.7):
        """Return a new Dataset holding a random ``frac`` of the rows.

        The column properties object is shared with the new Dataset.
        """
        drawn = self._data.sample(frac=frac)
        return Dataset(drawn, self._column_properties)

In [8]:
def load_dataset():
    """Load the default dataset by delegating to ``load_file()``."""
    # This is a module-level function, so there is no ``self`` to go through.
    return load_file()

def load_file(path=None, file='discrete_data.xlsx'):
    """Load a Dataset from an Excel workbook.

    The first sheet holds the data table.  The second sheet holds the column
    properties: each row is matched to a name in ``column_property_dict`` via
    its row index, and every column except the first and the last contributes
    one ``{column: value}`` entry for that property.

    Args:
        path: directory containing the workbook; defaults to the working
            directory at call time.
        file: workbook file name.

    Returns:
        A ``Dataset`` wrapping the data sheet and the collected properties.
    """
    if path is None:
        # Resolve at call time; a default of os.getcwd() in the signature
        # would be frozen when the function is defined.
        path = os.getcwd()
    workbook = os.path.join(path, file)

    # A list of sheet positions returns a dict of frames keyed by position,
    # so the workbook is only opened once.
    sheets = pd.read_excel(workbook, sheet_name=[0, 1])
    data = sheets[0]
    col_data = sheets[1]

    # Skip the first (property name) and last (label) columns.
    columns = col_data.columns[1:-1]
    column_properties = {}
    for r_idx, row in col_data.iterrows():
        column_properties[column_property_dict[r_idx]] = {col: row[col] for col in columns}
    return Dataset(data, column_properties)

def load_iris(path=None, file='iris.data'):
    """Load the iris data from a CSV file into a Dataset.

    The four measurement columns are flagged as continuous and ``Species`` as
    discrete.  The CSV is read with its first line as the header, so that
    header is expected to carry exactly these column names.

    Args:
        path: directory containing the file; defaults to the working
            directory at call time.
        file: CSV file name.
    """
    if path is None:
        # Resolve at call time; a default of os.getcwd() in the signature
        # would be frozen when the function is defined.
        path = os.getcwd()
    data = pd.read_csv(os.path.join(path, file))
    column_properties = {column_property_dict[0]: {'SepalLengthCm': False,
                                                   'SepalWidthCm': False,
                                                   'PetalLengthCm': False,
                                                   'PetalWidthCm': False,
                                                   'Species': True}}
    return Dataset(data, column_properties)

def load_test_dataset():
    """Build a small two-row in-memory Dataset for testing.

    Column names and values are Chinese: age (continuous), age category,
    has job, owns house, credit standing, and the class label (all discrete).
    """
    # Two applicants: a 25-year-old "young" one and a 37-year-old
    # "middle-aged" one, both labelled "approved".
    records = {'年龄': [25, 37],
               '年龄分类': ['青年', '中年'],
               '有工作': ['是', '是'],
               '有房子': ['是', '是'],
               '信贷情况': ['一般', '好'],
               '类别': ['通过', '通过']}

    # Only the numeric age column is continuous.
    discrete_flags = {'年龄': False,
                      '年龄分类': True,
                      '有工作': True,
                      '有房子': True,
                      '信贷情况': True,
                      '类别': True}

    frame = pd.DataFrame(records)
    return Dataset(frame, {column_property_dict[0]: discrete_flags})

In [10]:
if __name__ == '__main__':
    # Logging setup: INFO level, each record prefixed with file name and line number.
    # Disabled alternative (DEBUG level with timestamps), kept for reference:
#     logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
    logging.basicConfig(level=logging.INFO, format='%(filename)s[%(lineno)d] - %(message)s')
    
    
    # Model training: load the iris data and grow a forest of k=3 trees.
    # Disabled alternative data source (Excel workbook):
#     dataset = load_dataset()
    dataset = load_iris()
    # Disabled: draw a single sample by hand.
#     sample = dataset.sample()
    
    rf = RandomForest(dataset=dataset, k=3)
    rf.generate_forest()

    # Graph rendering is disabled; ``dt`` is not defined in this script.
#     dt.gen_graph()
    
    # Verification: run the forest over the full training dataset.
    rf.predict(dataset)
    # Disabled: accuracy on the training data.
#     verify_ratings = rf.rating(dataset)
#     logging.info('verify rating: %.2f%%' % (verify_ratings * 100))
    
    # Disabled: accuracy on the small hand-made test dataset
    # (also refers to the undefined ``dt``).
#     test_data = load_test_dataset()
#     test_ratings = dt.rating(test_data)
#     logging.info('test rating: %.2f%%' % (test_ratings * 100))

<ipython-input-5-cfc7a7ae464c>[93] - best feature: PetalLengthCm, gini: 0.318977
<ipython-input-5-cfc7a7ae464c>[93] - best feature: PetalWidthCm, gini: 0.085208
<ipython-input-5-cfc7a7ae464c>[93] - best feature: SepalLengthCm, gini: 0.083983
<ipython-input-5-cfc7a7ae464c>[93] - best feature: SepalWidthCm, gini: 0.056818
<ipython-input-5-cfc7a7ae464c>[117] - label conflict exists:
Iris-versicolor    15
Iris-virginica      1
Name: Species, dtype: int64
<ipython-input-5-cfc7a7ae464c>[93] - best feature: SepalWidthCm, gini: 0.000000
<ipython-input-5-cfc7a7ae464c>[93] - best feature: SepalLengthCm, gini: 0.053571
<ipython-input-5-cfc7a7ae464c>[93] - best feature: SepalWidthCm, gini: 0.142857
<ipython-input-5-cfc7a7ae464c>[117] - label conflict exists:
Iris-versicolor    1
Iris-virginica     1
Name: Species, dtype: int64
<ipython-input-6-0f6249bcdba6>[78] - Decision Node [PetalLengthCm], Value [2.45]
<ipython-input-6-0f6249bcdba6>[76] - Label Node [Iris-setosa]
<ipython-input-6-0f6249bcdba6>

         pred_labels
0        Iris-setosa
1        Iris-setosa
2        Iris-setosa
3        Iris-setosa
4        Iris-setosa
5        Iris-setosa
6        Iris-setosa
7        Iris-setosa
8        Iris-setosa
9        Iris-setosa
10       Iris-setosa
11       Iris-setosa
12       Iris-setosa
13       Iris-setosa
14       Iris-setosa
15       Iris-setosa
16       Iris-setosa
17       Iris-setosa
18       Iris-setosa
19       Iris-setosa
20       Iris-setosa
21       Iris-setosa
22       Iris-setosa
23       Iris-setosa
24       Iris-setosa
25       Iris-setosa
26       Iris-setosa
27       Iris-setosa
28       Iris-setosa
29       Iris-setosa
..               ...
120   Iris-virginica
121   Iris-virginica
122   Iris-virginica
123   Iris-virginica
124   Iris-virginica
125   Iris-virginica
126   Iris-virginica
127   Iris-virginica
128   Iris-virginica
129  Iris-versicolor
130   Iris-virginica
131   Iris-virginica
132   Iris-virginica
133  Iris-versicolor
134  Iris-versicolor
135   Iris-vi