In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the first dataset: a space-separated text file with no header row.
filename = 'Druns.txt'
Data = pd.read_csv(filename, sep=' ', header=None)
# Name the columns: x1 and x2 are the features scanned for splits, y is the 0/1 label.
Data.columns = ['x1', 'x2', 'y']
# Data  # uncomment to inspect the loaded frame

In [3]:
def entropy(data):
    """Binary label entropy (in bits) of the 'y' column of `data`.

    Only the label values 0 and 1 are counted. An empty frame, or a frame
    in which one of the two classes is absent, has entropy 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'y' column.

    Returns
    -------
    int or float
        0 for empty / single-class input, otherwise the entropy in bits.
    """
    n_rows = len(data)
    if n_rows == 0:
        return 0

    labels = data['y']
    # Cast to plain ints so the arithmetic below yields Python floats.
    n_zero = int((labels == 0).sum())
    n_one = int((labels == 1).sum())
    p0 = n_zero / n_rows
    p1 = n_one / n_rows

    # By convention 0 * log(0) contributes nothing, so a missing class
    # means zero entropy (and avoids math.log(0)).
    if p0 == 0 or p1 == 0:
        return 0
    return -p0 * math.log(p0, 2) - p1 * math.log(p1, 2)

In [4]:
def entropy_feature(len_left, len_right):
    """Entropy (in bits) of a two-way split, given the size of each side.

    This is the "split information" used as the denominator of the
    information gain ratio.

    Parameters
    ----------
    len_left, len_right : int
        Number of rows that fall on each side of the split.

    Returns
    -------
    int or float
        0 when either side is empty (including when both are empty),
        otherwise the entropy of the (left, right) proportions.
    """
    total = len_left + len_right
    # No rows at all: there is no split to measure; return 0 instead of
    # dividing by zero.
    if total == 0:
        return 0
    p_l = len_left / total
    p_r = len_right / total
    # A one-sided split carries no information (and log(0) is undefined).
    if (p_l == 0) or (p_r == 0):
        return 0
    return -p_l * math.log(p_l, 2) - p_r * math.log(p_r, 2)

In [8]:
def InfoGain(data, threshold, feature):
    """Information gain of splitting `data` on `feature >= threshold`.

    Rows with `feature >= threshold` form the left branch and rows with
    `feature < threshold` form the right branch.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the `feature` column and a 'y' label column.
    threshold : number
        Candidate cut point.
    feature : str
        Name of the column to split on.

    Returns
    -------
    tuple
        (gain, n_left, n_right): the information gain and the number of
        rows in each branch. For an empty frame this is (0, 0, 0), so
        callers can always unpack three values.
    """
    if len(data) == 0:
        # Keep the return shape consistent with the non-empty path.
        return 0, 0, 0
    # Local name chosen so it does not shadow the notebook-level `Data` frame.
    subset = data[[feature, 'y']]
    D_left = subset[subset[feature] >= threshold]
    D_right = subset[subset[feature] < threshold]
    p_left = len(D_left) / len(data)
    p_right = len(D_right) / len(data)
    HY = entropy(data)
    entropy_left = entropy(D_left)
    entropy_right = entropy(D_right)
    # Gain = parent entropy minus the size-weighted entropy of the branches.
    Gain = HY - (p_left * entropy_left + p_right * entropy_right)
    return Gain, len(D_left), len(D_right)

In [9]:
def all_candidate_splits(data, features=("x1", "x2")):
    """List every distinct candidate split with its information gain ratio.

    For each feature, every value that occurs in the data is tried as a
    threshold for the test `feature >= value`. Each distinct threshold is
    evaluated once and reported once, in ascending order of the threshold.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns and a 'y' label column.
    features : iterable of str, optional
        Feature columns to scan. Defaults to ("x1", "x2").

    Returns
    -------
    list of str
        One entry per distinct (feature, threshold), formatted as
        "<feature> >= <threshold>, info gain ratio = <ratio to 4 decimals>".
    """
    final_result = []
    seen = set()  # (feature, printed threshold) pairs already reported
    for f in features:
        sorted_D = data.sort_values(by=[f])
        for i in range(len(sorted_D)):
            # Row-wise access is kept so the threshold value (and therefore
            # its printed form) is exactly what earlier runs produced.
            cuts = sorted_D.iloc[i][f]
            key = (f, str(cuts))
            # A repeated threshold would yield an identical entry, so skip it
            # before paying for another gain computation.
            if key in seen:
                continue
            seen.add(key)
            info_gain, len_left, len_right = InfoGain(data, cuts, f)
            HS = entropy_feature(len_left, len_right)
            # When the split entropy is 0 the ratio is undefined; fall back
            # to the raw information gain.
            gain_ratio = info_gain if HS == 0 else info_gain / HS
            final_result.append(
                f + ' >= ' + str(cuts) + ', info gain ratio = ' + str('%.4f' % gain_ratio)
            )
    return final_result

In [11]:
# Show every distinct candidate split of the Druns data with its gain ratio.
all_candidate_splits(Data) 

['x1 >= 0.0, info gain ratio = 0.0000',
 'x1 >= 0.1, info gain ratio = 0.1005',
 'x2 >= -2.0, info gain ratio = 0.0000',
 'x2 >= -1.0, info gain ratio = 0.1005',
 'x2 >= 0.0, info gain ratio = 0.0560',
 'x2 >= 1.0, info gain ratio = 0.0058',
 'x2 >= 2.0, info gain ratio = 0.0011',
 'x2 >= 3.0, info gain ratio = 0.0164',
 'x2 >= 4.0, info gain ratio = 0.0497',
 'x2 >= 5.0, info gain ratio = 0.1112',
 'x2 >= 6.0, info gain ratio = 0.2361',
 'x2 >= 7.0, info gain ratio = 0.0560',
 'x2 >= 8.0, info gain ratio = 0.4302']

In [12]:
# Load the second dataset; same layout as before (space-separated, no header row).
filename = 'D1.txt'
D1 = pd.read_csv(filename, sep=' ', header=None)
# Same column naming: x1 and x2 are features, y is the label.
D1.columns = ['x1', 'x2', 'y']

In [17]:
# Count the distinct candidate splits found in D1 (both features combined).
s = all_candidate_splits(D1)
len(s)

2000