In [1]:
using CSV, DataFrames, DecisionTree, StatsBase

In [2]:
data = CSV.read("contact_lens.csv", DataFrame)

Row,Age,Spectacle,Astigmatism,Tear,Class
Unnamed: 0_level_1,String15,String15,String15,String15,String15
1,young,myope,no,reduced,no lenses
2,young,myope,no,normal,soft
3,young,myope,yes,reduced,no lenses
4,young,myope,yes,normal,hard
5,young,hypermetrope,no,reduced,no lenses
6,young,hypermetrope,no,normal,soft
7,young,hypermetrope,yes,reduced,no lenses
8,young,hypermetrope,yes,normal,hard
9,pre-presbyopic,myope,no,reduced,no lenses
10,pre-presbyopic,myope,no,normal,soft


In [4]:
function encode_column(col)
    dict = Dict(unique(col) .=> 0:length(unique(col)) .- 1)
    return [dict[v] for v in col], dict
end

encode_column (generic function with 1 method)

In [5]:
encoded_data = DataFrame()

In [6]:
encoders = Dict()

Dict{Any, Any}()

In [7]:
for name in names(data)[1:end-1]
    encoded, mapping = encode_column(data[!, name])
    encoded_data[!, name] = encoded
    encoders[name] = mapping
end

In [8]:
encoded_class, class_map = encode_column(data[!, end])

([0, 1, 0, 2, 0, 1, 0, 2, 0, 1  …  0, 0, 0, 1, 0, 2, 0, 1, 0, 0], Dict{String15, Int64}(" hard" => 2, " no lenses" => 0, " soft" => 1))

In [9]:
y = Int.(encoded_class)

24-element Vector{Int64}:
 0
 1
 0
 2
 0
 1
 0
 2
 0
 1
 ⋮
 0
 0
 1
 0
 2
 0
 1
 0
 0

In [10]:
function my_entropy(y)
    n = length(y)
    counts = countmap(y)
    probs = [v / n for v in values(counts)]
    return -sum(p * log2(p) for p in probs)
end

my_entropy (generic function with 1 method)

In [11]:
total_entropy = my_entropy(y)

1.384431504340598

In [12]:
println("Total Entropy: ", total_entropy)

Total Entropy: 1.384431504340598


In [13]:
function information_gain(feature_col, y)
    n = length(y)
    parent_entropy = my_entropy(y)
    values = unique(feature_col)
    weighted_entropy = 0.0

    for val in values
        idx = findall(x -> x == val, feature_col)
        subset_y = y[idx]
        weight = length(subset_y) / n
        weighted_entropy += weight * my_entropy(subset_y)
    end

    return parent_entropy - weighted_entropy
end

information_gain (generic function with 1 method)

In [14]:
println("Information Gain Setiap Fitur:")
for name in names(encoded_data)
    gain = information_gain(encoded_data[!, name], y)
    println(" - ", name, ": ", round(gain, digits=4))
end

Information Gain Setiap Fitur:
 - Age           : 0.0186
 -  Spectacle   : 0.0401
 -  Astigmatism: 0.4253
 -  Tear   : 0.6549


In [15]:
X = convert(Matrix{Float64}, Matrix(encoded_data))

24×4 Matrix{Float64}:
 0.0  0.0  0.0  0.0
 0.0  0.0  0.0  1.0
 0.0  0.0  1.0  0.0
 0.0  0.0  1.0  1.0
 0.0  1.0  0.0  0.0
 0.0  1.0  0.0  1.0
 0.0  1.0  1.0  0.0
 0.0  1.0  1.0  1.0
 1.0  0.0  0.0  0.0
 1.0  0.0  0.0  1.0
 ⋮              
 1.0  1.0  1.0  1.0
 2.0  0.0  0.0  0.0
 2.0  0.0  0.0  1.0
 2.0  0.0  1.0  0.0
 2.0  0.0  1.0  1.0
 2.0  1.0  0.0  0.0
 2.0  1.0  0.0  1.0
 2.0  1.0  1.0  0.0
 2.0  1.0  1.0  1.0

In [16]:
model = build_tree(y, X, 4)

Decision Tree
Leaves: 5
Depth:  4

In [17]:
println("Decision Tree:")
feature_names = names(encoded_data)
print_tree(model, 5, feature_names=feature_names)

Decision Tree:
Feature 4: " Tear   " < 0.5 ?
├─ 0 : 12/12
└─ Feature 3: " Astigmatism" < 0.5 ?
    ├─ 1 : 6/6
    └─ Feature 2: " Spectacle   " < 0.5 ?
        ├─ 2 : 3/3
        └─ Feature 1: "Age           " < 0.5 ?
            ├─ 2 : 1/1
            └─ 0 : 2/2
