In [418]:
import csv
import numpy as np

In [419]:
filename = "Druns.txt"

In [420]:
data = []
with open(filename, 'r') as file:
    reader = csv.reader(file, delimiter=' ')
    for row in reader:
        data.append([float(value) for value in row])

In [421]:
data = np.array(data)

In [422]:
def CalculateEntropy(data_label):
    _, counts = np.unique(data_label, return_counts=True)
    prob = counts/len(data_label)
    entropy = -np.sum(prob * np.log2(prob))
    return entropy

In [423]:
def splitEntropy(data,split,col):
    subgroup1 = data[data[:, col] < split]  
    subgroup2 = data[data[:, col] >= split] 
    proportion1 = subgroup1.shape[0]/data.shape[0]
    proportion2 = subgroup2.shape[0]/data.shape[0]
    entropy_split = - proportion1* np.log2(proportion1) - proportion2 * np.log2(proportion2)
    return entropy_split

In [424]:
def InfoGain(data,split,col):
    entropy_beforesplit = CalculateEntropy(data[:,2])
    subgroup1 = data[data[:, col] < split]  
    subgroup2 = data[data[:, col] >= split] 
    # print(split)
    # print(subgroup1)
    # print(subgroup2) 
    proportion1 = subgroup1.shape[0]/data.shape[0]
    proportion2 = subgroup2.shape[0]/data.shape[0]
    entropy_subgroup1 = CalculateEntropy(subgroup1[:,2])
    entropy_subgroup2 = CalculateEntropy(subgroup2[:,2])
    entropy_aftersplit = proportion1*entropy_subgroup1 + proportion2*entropy_subgroup2
    infogain = entropy_beforesplit - entropy_aftersplit
    return infogain

In [425]:
def GainRatio(data,split,col):
    infogain = InfoGain(data,split,col)
    entropy_split = splitEntropy(data,split,col)
    gainratio = infogain/entropy_split
    return gainratio

In [426]:
def DetermineCandidateSplits(data,col):
    C = []
    sorted_indices = np.argsort(data[:, col])
    sorted_data = data[sorted_indices]
    print(sorted_data)
    for i in range(len(sorted_data)-1):
        if sorted_data[i,2] != sorted_data[i+1,2]:
            infogain = InfoGain(sorted_data,sorted_data[i+1,col],col)
            # print(sorted_data[i+1,col],infogain)
            if infogain > 0:
                gainratio = GainRatio(sorted_data,sorted_data[i+1,col],col)
                gainratio = np.round(gainratio,4)
                print(f"For column {col} with cut {sorted_data[i+1,col]}: Gain Ratio = {gainratio}" )
                C.append(sorted_data[i+1,col])
            else:
                print(f"For column {col} with cut {sorted_data[i+1,col]}: Informatio gain = {infogain}, so this cut is skipped." )
    return C

In [427]:
C_1 = DetermineCandidateSplits(data,0)
C_2 = DetermineCandidateSplits(data,1)

[[ 0.  -1.   1. ]
 [ 0.   0.   0. ]
 [ 0.   1.   0. ]
 [ 0.   2.   0. ]
 [ 0.   3.   0. ]
 [ 0.   4.   0. ]
 [ 0.   5.   0. ]
 [ 0.   6.   1. ]
 [ 0.   7.   0. ]
 [ 0.   8.   1. ]
 [ 0.1 -2.   0. ]]
For column 0 with cut 0.0: Informatio gain = 0.0, so this cut is skipped.
For column 0 with cut 0.0: Informatio gain = 0.0, so this cut is skipped.
For column 0 with cut 0.0: Informatio gain = 0.0, so this cut is skipped.
For column 0 with cut 0.0: Informatio gain = 0.0, so this cut is skipped.
For column 0 with cut 0.1: Gain Ratio = 0.1005
[[ 0.1 -2.   0. ]
 [ 0.  -1.   1. ]
 [ 0.   0.   0. ]
 [ 0.   1.   0. ]
 [ 0.   2.   0. ]
 [ 0.   3.   0. ]
 [ 0.   4.   0. ]
 [ 0.   5.   0. ]
 [ 0.   6.   1. ]
 [ 0.   7.   0. ]
 [ 0.   8.   1. ]]
For column 1 with cut -1.0: Gain Ratio = 0.1005
For column 1 with cut 0.0: Gain Ratio = 0.056
For column 1 with cut 6.0: Gain Ratio = 0.2361
For column 1 with cut 7.0: Gain Ratio = 0.056
For column 1 with cut 8.0: Gain Ratio = 0.4302
