In [3]:
using DataFrames, MLDataUtils
using Clustering, Distances
using CSV
using Random
using Logging

┌ Info: Precompiling CSV [336ed68f-0bac-5ca0-87d4-7b16caf5d00b]
└ @ Base loading.jl:1186


In [4]:

# Set up logging: install a logger that only emits messages at Warn level or above, which
# avoids package info messages during the model training process.
logger = Logging.SimpleLogger(stderr, Logging.Warn);
global_logger(logger);

#### Set parameters for the learners
cr = :dunnindex            # criterion handed to the ICOT learners
method = "ICOT_local"
warm_start = :none;
geom_search = false
threshold = 0.0
seed = 1                   # used for Random.seed! below and for the learners' random seed
gridsearch = false
num_tree_restarts = 100
complexity_c = 0.0
min_bucket = 10
maxdepth = 5

###### Step 1: Prepare the data
# Read the data. The original example used the (deprecated) readtable() command,
#     data = readtable("../data/ruspini.csv")
# which can be used instead if the installed CSV package version causes conflicts.
# NOTE(review): this is an absolute, machine-specific path; adjust it to your checkout.
data_path = "/home/sfy/Documents/VScodeProject/Thesis/data/ruspini.csv"
data = DataFrame(CSV.File(data_path))

# Convert the dataset to a matrix (one row per observation)
data_array = Matrix(data)
# Get the number of rows and columns; note that p also counts the label column
n, p = size(data_array)
data_t = data_array';

##### Step 2: Fit K-means clustering on the dataset to generate a warm-start for ICOT
# Fix the seed
Random.seed!(seed);

# The ruspini dataset has pre-defined clusters (stored in the last column), which we use to
# select the cluster count (K) for the K-means algorithm.
# In an unsupervised setting (with no prior-known K), the number of clusters for K-means can
# be selected using the elbow method.
K = length(unique(data_array[:, end]))

# Run k-means on the feature columns only and save the assignments. The last column holds
# the pre-defined labels (it is the column K was derived from), so it is excluded here:
# clustering on it would leak the ground truth into the k-means assignments.
# kmeans expects one observation per column, hence the permutedims.
features_t = permutedims(data_array[:, 1:(p - 1)])
kmeans_result = kmeans(features_t, K);
assignment = kmeans_result.assignments;

# Append the k-means assignments as a fourth column and give all columns stable names.
# rename! replaces the deprecated names! (which raised the deprecation warning here).
data_full = copy(data)
data_full.kmean_assign = assignment
rename!(data_full, [:x1, :x2, :true_labels, :kmean_assign]);

# Prepare data for ICOT: features are stored in X, labels are stored in y.
# NOTE(review): y is taken from :true_labels, not from :kmean_assign — confirm which
# labelling is intended as the warm start.
X = data_full[:, 1:2];
y = data_full[:, :true_labels];


│   caller = top-level scope at In[4]:44
└ @ Core In[4]:44


In [5]:
# Display the feature table X (columns x1 and x2) as the cell output
X

Unnamed: 0_level_0,x1,x2
Unnamed: 0_level_1,Float64,Float64
1,4.0,53.0
2,5.0,63.0
3,10.0,59.0
4,9.0,77.0
5,13.0,49.0
6,13.0,69.0
7,12.0,88.0
8,15.0,75.0
9,18.0,61.0
10,19.0,65.0


In [7]:
##### Step 3a. Test the IAI license before running ICOT
# Build an Optimal Classification Tree learner, wrap it in a grid search, fit it on (X, y)
# and open the resulting tree in the browser.
lnr_oct = ICOT.IAI.OptimalTreeClassifier(;
    localsearch=false,
    max_depth=maxdepth,
    minbucket=min_bucket,
    criterion=:misclassification,
)
grid = ICOT.IAI.GridSearch(lnr_oct)
ICOT.IAI.fit!(grid, X, y)
ICOT.IAI.showinbrowser(grid.lnr)


└ @ IAIBase /home/iai/.julia/packages/IAIBase/pOrUV/src/precompile.jl:19
│   caller = top-level scope at In[7]:8
└ @ Core In[7]:8


"/tmp/tmpmcL3A6/tree.html"

In [8]:

##### Step 3b. Run ICOT

# Variant 1: ICOT without any warm-start.
warm_start = :none
lnr_ws_none = ICOT.InterpretableCluster(;
    ls_num_tree_restarts=num_tree_restarts,
    ls_random_seed=seed,
    cp=complexity_c,
    max_depth=maxdepth,
    minbucket=min_bucket,
    criterion=cr,
    ls_warmstart_criterion=cr,
    kmeans_warmstart=warm_start,
    geom_search=geom_search,
    geom_threshold=threshold,
);
# Fit the learner and record the wall-clock training time in seconds.
run_time_icot_ls_none = @elapsed ICOT.fit!(lnr_ws_none, X, y);

# Open the fitted tree in the browser.
ICOT.showinbrowser(lnr_ws_none)


│   caller = ip:0x0
└ @ Core :-1
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:16[39m


Process(`[4mxdg-open[24m [4m/tmp/tmpANqkPG/tree.html[24m`, ProcessExited(0))

In [9]:
# Score the learner trained without a warm-start under the Dunn index and the silhouette
# criterion.
score_ws_none = ICOT.score(lnr_ws_none, X, y; criterion=:dunnindex);
score_al_ws_none = ICOT.score(lnr_ws_none, X, y; criterion=:silhouette);


│   caller = dunn_score(::Array{Float64,2}, ::Array{Int64,1}) at clustering_tree.jl:132
└ @ ICOT /home/iai/.julia/packages/ICOT/34UmY/src/clustering/clustering_tree.jl:132
Gtk-Message: 19:05:24.626: Not loading module "atk-bridge": The functionality is provided by GTK natively. Please try to not load it.


In [10]:

# Variant 2: ICOT with an OCT warm-start. An OCT is first fit as a supervised learning
# problem with labels "y", and that tree is used as the warm-start.
warm_start = :oct
lnr_ws_oct = ICOT.InterpretableCluster(;
    ls_num_tree_restarts=num_tree_restarts,
    ls_random_seed=seed,
    cp=complexity_c,
    max_depth=maxdepth,
    minbucket=min_bucket,
    criterion=cr,
    ls_warmstart_criterion=cr,
    kmeans_warmstart=warm_start,
    geom_search=geom_search,
    geom_threshold=threshold,
);
# Fit the learner and record the wall-clock training time in seconds.
run_time_icot_ls_oct = @elapsed ICOT.fit!(lnr_ws_oct, X, y);

# Score the warm-started learner under the Dunn index and the silhouette criterion.
score_ws_oct = ICOT.score(lnr_ws_oct, X, y; criterion=:dunnindex);
score_al_ws_oct = ICOT.score(lnr_ws_oct, X, y; criterion=:silhouette);


[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:02[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:18[39m
