In [2]:
using DataFrames, MLDataUtils
using Clustering, Distances
using CSV
using Random
using Logging
using Printf

# Route log records to stderr with a minimum level of Warn: Debug/Info chatter emitted by
# packages during model training is dropped, while warnings and errors are still shown.
logger = Logging.SimpleLogger(stderr, Logging.Warn)
Logging.global_logger(logger);

In [3]:
#### Learner settings (plain globals read by the cells below)

# --- Criterion and warm start -------------------------------------------------
cr = :silhouette       # validation criterion used to train ICOT
method = "ICOT_local"
# Warm-start strategy. Per the paper, a warm-start solution initialises the algorithm;
# :oct fits a supervised tree on the supplied labels. A greedy (CART-style) option was
# noted as an alternative that has not been tried here.
warm_start = :oct

# --- Geometric feature-space search --------------------------------------------
# Boolean switch for the geometric component of the feature-space search
# (see Section 3.3.1 of the paper).
geom_search = true
threshold = 0.99       # percentile of gaps (the original note is cut off after "for each")

# --- Local search ---------------------------------------------------------------
seed = 1
gridsearch = false
# Number of random restarts used by the local search. Must be positive; the library
# default is 100. Tree quality typically improves as this grows, with quickly
# diminishing returns, while training cost grows linearly.
num_tree_restarts = 20

# --- Tree shape -----------------------------------------------------------------
complexity_c = 0.0
min_bucket = 100
maxdepth = 5           # tunable; a sweep over depths is the intended way to pick it

###### Step 1: Prepare the data
# Load the CSV at the hard-coded absolute path below into `dataset`.
# readtable() is deprecated; it is used here deliberately to avoid potential version
# conflicts with the CSV package.
# NOTE(review): the path is specific to one machine — adjust it before running elsewhere.
dataset = readtable("/home/sfy/Documents/VScodeProject/Thesis/algorithms/dataset_Scaler.csv"); 
# println(dataset[1, :])


│   caller = top-level scope at In[3]:17
└ @ Core In[3]:17


In [4]:
# Split the table: every column except the last is a feature, the last one is the label.
data_x = dataset[:, 1:end-1]
true_label = dataset[:, end]

# Dense Float64 matrix of the feature columns.
data_array = convert(Matrix{Float64}, data_x)

# Number of observations (rows) and number of features (columns).
n = size(data_array, 1)
p = size(data_array, 2)

# Adjoint of the feature matrix; this is what gets handed to kmeans further down.
data_t = adjoint(data_array);

##### Step 2: Fit K-means clustering on the dataset to generate a warm-start for ICOT
# Fix the seed before clustering — presumably so the k-means result below is repeatable.
Random.seed!(seed);

# Number of clusters for K-means, hard-coded here.
# NOTE(review): the earlier comment justified K via the ruspini dataset's pre-defined clusters, which looks like a leftover from example code — confirm that 10 is right for this dataset.
# In an unsupervised setting (with no prior-known K), the number of clusters for K means can be selected using the elbow method.
K = 10

# Run k-means on the transposed feature matrix and keep the per-observation assignments
kmeans_result = kmeans(data_t, K);
assignment = kmeans_result.assignments;

# Append the k-means assignments to the original table as one extra column; the result
# is what the next model consumes.
data_full = DataFrame(hcat(dataset, assignment, makeunique=true));

# Rename the label column to :true_labels, unless a column of that name already exists.
# NOTE(review): assumes the label column of `dataset` is called :label — confirm.
if !hasproperty(data_full, :true_labels)
    rename!(data_full, :label => :true_labels)
end

# Rename the appended assignment column to :kmean_assign, unless it already exists.
# NOTE(review): assumes hcat named the appended vector :x1 — confirm; if `dataset` itself
# had an :x1 column, this would rename a different column.
if !hasproperty(data_full, :kmean_assign)
    rename!(data_full, :x1 => :kmean_assign)
end

Unnamed: 0_level_0,android_content_Context_getSystemService,java_net_URL_openConnection,android_net_ConnectivityManager_getActiveNetworkInfo
Unnamed: 0_level_1,Float64⍰,Float64⍰,Float64⍰
1,-0.5042,-0.6994,-0.3466
2,-0.5042,-0.6994,-0.3466
3,-0.5042,-0.6994,-0.3466
4,-0.5042,-0.6994,-0.3466
5,-0.5042,-0.6994,-0.3466
6,-0.5042,-0.6994,-0.3466
7,-0.4686,-0.2915,-0.3466
8,-0.4686,-0.2915,-0.3466
9,-0.4686,-0.2915,-0.3466
10,-0.4686,-0.2915,-0.3466


In [5]:
# Print the first row of the combined table. The original index `:1` is only the integer
# literal 1 with a quote prefix (`:1 === 1`), so the plain index selects the same row
# without looking like a symbol or a colon.
println(data_full[1, :])

DataFrameRow
│ Row │ android_content_Context_getSystemService │ java_net_URL_openConnection │ android_net_ConnectivityManager_getActiveNetworkInfo │ javax_crypto_Cipher_init │ android_content_pm_PackageManager_getPackageInfo │ android_telephony_TelephonyManager_getDeviceId │ android_net_wifi_WifiManager_setWifiEnabled │ android_permission_READ_CALL_LOG │ android_os_PowerManager_WakeLock_release │ android_provider_Settings_System_putInt │ java_lang_Throwable_printStackTrace │ android_net_wifi_WifiManager_getConnectionInfo │ android_net_wifi_WifiManager_WifiLock_acquire │ android_telephony_TelephonyManager_getCellLocation │ android_os_PowerManager_WakeLock_acquire │ android_widget_VideoView_start │ android_intent_action_SENDTO │ android_net_ConnectivityManager_getAllNetworkInfo │ android_permission_SEND_SMS │ android_media_MediaPlayer_start │ java_net_URLConnection_connect │ android_telephony_TelephonyManager_getSubscriberId │ javax_crypto_CipherOutputStream_close │ android_widget_VideoV

In [12]:
# Prepare data for ICOT: X takes every column of data_full except the last two,
# and y takes the :true_labels column.
# NOTE(review): y is what gets passed to ICOT.fit! as the labels for the warm-start, yet it
# is read from :true_labels rather than from the k-means column :kmean_assign — confirm
# that this is intended.
X = data_full[:, 1:end-2];
y = data_full[:, :true_labels];


"""
    icot(maxdepth; show_tree=true)

Fit an ICOT `InterpretableCluster` learner with an OCT warm-start (an OCT is fitted as a
supervised problem on the labels `y` and used as the starting point), score it, print the
scores, and optionally open the fitted tree in the browser.

Reads the notebook globals `X`, `y`, `num_tree_restarts`, `seed`, `complexity_c`,
`min_bucket`, `cr`, `geom_search` and `threshold`.

# Arguments
- `maxdepth`: value forwarded as the learner's `max_depth`.

# Keywords
- `show_tree::Bool=true`: call `ICOT.showinbrowser` on the fitted learner. Pass `false`
  when sweeping several depths to avoid opening one browser tab per fit.

# Returns
A named tuple `(dunnindex, silhouette, fit_seconds)` so callers can compare runs instead
of reading the printed output.
"""
function icot(maxdepth; show_tree::Bool=true)
    # Pinned locally on purpose: this function always uses the OCT warm-start, regardless
    # of what the global `warm_start` is set to.
    warm_start = :oct
    lnr_ws_oct = ICOT.InterpretableCluster(
        ls_num_tree_restarts=num_tree_restarts,
        ls_random_seed=seed,
        cp=complexity_c,
        max_depth=maxdepth,
        minbucket=min_bucket,
        criterion=cr,
        ls_warmstart_criterion=cr,
        kmeans_warmstart=warm_start,
        geom_search=geom_search,
        geom_threshold=threshold,
    )
    fit_seconds = @elapsed ICOT.fit!(lnr_ws_oct, X, y)

    dunn = float(ICOT.score(lnr_ws_oct, X, y, criterion=:dunnindex))
    sil = float(ICOT.score(lnr_ws_oct, X, y, criterion=:silhouette))

    # Print the scores before launching the browser so the browser's own console output
    # is less likely to land in the middle of these lines.
    @printf("dunnindex = %.4f\n", dunn)
    @printf("silhouette = %.4f\n", sil)

    if show_tree
        ICOT.showinbrowser(lnr_ws_oct)
    end

    return (dunnindex=dunn, silhouette=sil, fit_seconds=fit_seconds)
end


icot (generic function with 1 method)

In [8]:
# Depth sweep: for each tree depth from 4 through 10, announce the depth, then fit and
# score through icot(). @time reports the cost of every iteration separately.
for depth in 4:10
    @time begin
        @printf("MAXDEPTH = %1i\n", depth)
        icot(depth)
    end
end

MAXDEPTH = 4


│   caller = ip:0x0
└ @ Core :-1
└ @ IAIBase /home/iai/.julia/packages/IAIBase/pOrUV/src/precompile.jl:19
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:47[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:58[39m
│   caller = dunn_score(::Array{Float64,2}, ::Array{Int64,1}) at clustering_tree.jl:132
└ @ ICOT /home/iai/.julia/packages/ICOT/34UmY/src/clustering/clustering_tree.jl:132
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:39[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:01:01[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:37[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:01:00[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:41[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:01:04[39m
[32mTraining trees...100%|██████████████████████████████████| Tim

In [14]:
# Single timed run at depth 3. Note that this reassigns the global `maxdepth`.
maxdepth = 3
@time begin
    @printf("MAXDEPTH = %1i\n", maxdepth)
    icot(maxdepth)
end

MAXDEPTH = 3


│   caller = ip:0x0
└ @ Core :-1
└ @ IAIBase /home/iai/.julia/packages/IAIBase/pOrUV/src/precompile.jl:19
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:26[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:26[39m
│   caller = dunn_score(::Array{Float64,2}, ::Array{Int64,1}) at clustering_tree.jl:132
└ @ ICOT /home/iai/.julia/packages/ICOT/34UmY/src/clustering/clustering_tree.jl:132
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:19[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:27[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:18[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:27[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:17[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:29[39m
[32mTraining trees...100%|██████████████████████████████████| Tim

dunnindex = 0

/snap/core20/current/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /lib/x86_64-linux-gnu/libproxy.so.1)
Failed to load module: /home/sfy/snap/code/common/.cache/gio-modules/libgiolibproxy.so


.0054
silhouette = 0.4208
9091.263039 seconds (384.21 M allocations: 226.567 GiB, 0.83% gc time)


/snap/core20/current/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /lib/x86_64-linux-gnu/libproxy.so.1)
Failed to load module: /home/sfy/snap/code/common/.cache/gio-modules/libgiolibproxy.so
libva error: vaGetDriverNameByIndex() failed with unknown libva error, driver_name = (null)


Opening in existing browser session.
