In [1]:
using DataFrames, MLDataUtils
using Clustering, Distances
using CSV
using Random
using Logging
using Printf

# Logging setup: only messages at Warn level or above are written to stderr, which keeps
# package-level info/debug chatter out of the output while models are training.
logger = SimpleLogger(stderr, Logging.Warn)
global_logger(logger);

In [12]:
#### Learner parameters

# -- Objective and warm start --
cr = :silhouette  # validation criterion used to train ICOT
method = "ICOT_local"
# Warm start (per the paper): a warm-start solution is used to initialize the algorithm.
# With :oct, K-means labels are used as the targets of a supervised tree; the greedy
# option fits a CART tree to the labels instead.
# NOTE(review): the original note asks whether the greedy option is worth trying or is
# worse than the K-means-based start -- still unresolved.
warm_start = :oct

# -- Feature-space search --
# Boolean switch controlling whether the algorithm enables the geometric component of the
# feature-space search. See Section 3.3.1 of the paper.
geom_search = false
# Percentile of gaps used by the geometric search (passed on as `geom_threshold`).
# NOTE(review): the original comment is cut off after "for each" -- confirm the exact meaning.
threshold = 0.99

# -- Local search --
# Number of random restarts used by the local search. Must be a positive integer; the
# library default is 100. Tree quality typically improves as this grows, with quickly
# diminishing returns, while training cost grows linearly with it.
num_tree_restarts = 10
seed = 1

# -- Tree shape / regularisation --
complexity_c = 0.0
min_bucket = 100

gridsearch = false


###### Step 1: Prepare the data
# The (deprecated) readtable() call is used on purpose: it avoids potential version
# conflicts with the CSV package.
dataset = readtable("dataset_noScaler.csv");


│   caller = top-level scope at In[12]:16
└ @ Core In[12]:16


In [3]:
# Print the first row of the loaded table as a quick look at its columns.
println(dataset[1, :])

DataFrameRow
│ Row │ android_content_Context_getSystemService │ java_net_URL_openConnection │ android_os_PowerManager_WakeLock_release │ android_net_ConnectivityManager_getActiveNetworkInfo │ android_telephony_TelephonyManager_getDeviceId │ android_os_PowerManager_WakeLock_acquire │ android_intent_action_SENDTO │ android_location_LocationManager_getLastKnownLocation │ android_app_NotificationManager_notify │ android_media_Ringtone_play │ android_content_pm_PackageManager_getPackageInfo │ android_media_MediaPlayer_stop │ android_net_wifi_WifiManager_setWifiEnabled │ android_bluetooth_BluetoothDevice_createRfcommSocketToServiceRecord │ javax_crypto_Cipher_init │ javax_crypto_Cipher_getParameters │ android_widget_VideoView_start │ android_media_MediaPlayer_start │ javax_crypto_Cipher_getBlockSize │ android_widget_VideoView_setVideoPath │ javax_crypto_Cipher_getInstance │ android_os_Vibrator_vibrate │ android_permission_READ_CALL_LOG │ android_accounts_AccountManager_getAccountsByType │ ja

In [None]:
# Split the table: every column except the last is a feature, the last column is the label.
data_x = dataset[:, 1:end-1]
true_label = dataset[:, end]

# Dense Float64 feature matrix with n observations (rows) and p features (columns).
data_array = convert(Matrix{Float64}, data_x);
n, p = size(data_array)
# kmeans is given the transposed matrix (features in rows, observations in columns).
data_t = data_array';

##### Step 2: Fit K-means clustering on the dataset to generate a warm-start for ICOT
# Seed the RNG so the K-means run is repeatable.
Random.seed!(seed);

# The cluster count is fixed by hand here. In an unsupervised setting (with no prior-known
# K), the number of clusters for K-means can be selected using the elbow method.
K = 10

# Run K-means and keep the per-observation cluster assignments.
kmeans_result = kmeans(data_t, K);
assignment = kmeans_result.assignments;

# Append the assignments to the original table; the appended column is renamed from :x1
# below.
data_full = DataFrame(hcat(dataset, assignment, makeunique=true));

# Give the label column and the assignment column their final names, skipping a rename
# when the target name is already present.
hasproperty(data_full, :true_labels) || rename!(data_full, :label => :true_labels)

if hasproperty(data_full, :kmean_assign)
    println("all done")
else
    rename!(data_full, :x1 => :kmean_assign)
end

In [5]:
# Print the first row of the combined table.
println(data_full[1, :])

DataFrameRow
│ Row │ android_content_Context_getSystemService │ java_net_URL_openConnection │ android_os_PowerManager_WakeLock_release │ android_net_ConnectivityManager_getActiveNetworkInfo │ android_telephony_TelephonyManager_getDeviceId │ android_os_PowerManager_WakeLock_acquire │ android_intent_action_SENDTO │ android_location_LocationManager_getLastKnownLocation │ android_app_NotificationManager_notify │ android_media_Ringtone_play │ android_content_pm_PackageManager_getPackageInfo │ android_media_MediaPlayer_stop │ android_net_wifi_WifiManager_setWifiEnabled │ android_bluetooth_BluetoothDevice_createRfcommSocketToServiceRecord │ javax_crypto_Cipher_init │ javax_crypto_Cipher_getParameters │ android_widget_VideoView_start │ android_media_MediaPlayer_start │ javax_crypto_Cipher_getBlockSize │ android_widget_VideoView_setVideoPath │ javax_crypto_Cipher_getInstance │ android_os_Vibrator_vibrate │ android_permission_READ_CALL_LOG │ android_accounts_AccountManager_getAccountsByType │ ja

In [11]:
# Fit a classifier from the IAI module bundled with ICOT through a grid search, then open
# the fitted tree in the browser.
# NOTE(review): this cell reads `maxdepth`, `X` and `y`, which are assigned in a different
# cell (where `data_full` is sliced) -- that cell has to run first.
lnr_oct = ICOT.IAI.OptimalTreeClassifier(;
    localsearch=false,
    max_depth=maxdepth,
    minbucket=min_bucket,
    criterion=:misclassification,
)
grid = ICOT.IAI.GridSearch(lnr_oct)
ICOT.IAI.fit!(grid, X, y)
ICOT.IAI.showinbrowser(grid.lnr)

│   caller = top-level scope at In[11]:7
└ @ Core In[11]:7


"/tmp/tmpAkWa6Q/tree.html"

In [13]:
# Prepare data for ICOT: the features are stored in X (every column of `data_full` except
# the last two, which hold the true label and the K-means assignment), and the labels
# handed to the warm start are stored in y.
X = data_full[:, 1:end-2];
y = data_full[:, :true_labels];

# Tree depth for the single run below; tune it by sweeping over several depths.
maxdepth = 6

# Warm-start mode read by `icot`: fit an OCT as a supervised problem on the labels `y` and
# use that tree as the starting point.
warm_start = :oct

"""
    icot(maxdepth)

Fit an `ICOT.InterpretableCluster` learner of depth `maxdepth` on the globals `X` and `y`,
score it, open the tree in the browser and print both scores.

All other settings (`num_tree_restarts`, `seed`, `complexity_c`, `min_bucket`, `cr`,
`warm_start`, `geom_search`, `threshold`) are read from the notebook's global parameters
at call time.

# Returns
A named tuple `(learner, fit_seconds, dunnindex, silhouette)` where `fit_seconds` is the
wall-clock time of `ICOT.fit!` and the two scores come from `ICOT.score` with the
`:dunnindex` and `:silhouette` criteria.
"""
function icot(maxdepth)
    learner = ICOT.InterpretableCluster(
        ls_num_tree_restarts=num_tree_restarts,
        ls_random_seed=seed,
        cp=complexity_c,
        max_depth=maxdepth,
        minbucket=min_bucket,
        criterion=cr,
        ls_warmstart_criterion=cr,
        kmeans_warmstart=warm_start,
        geom_search=geom_search,
        geom_threshold=threshold,
    )
    fit_seconds = @elapsed ICOT.fit!(learner, X, y)

    dunnindex = ICOT.score(learner, X, y, criterion=:dunnindex)
    silhouette = ICOT.score(learner, X, y, criterion=:silhouette)
    ICOT.showinbrowser(learner)

    @printf("dunnindex = %.4f\n", float(dunnindex))
    @printf("silhouette = %.4f\n", float(silhouette))

    return (
        learner=learner,
        fit_seconds=fit_seconds,
        dunnindex=dunnindex,
        silhouette=silhouette,
    )
end

# Single timed run at the depth chosen above.
@time begin
    icot_result = icot(maxdepth)
end

# Keep the results under the names this cell has always exposed.
lnr_ws_oct = icot_result.learner
run_time_icot_ls_oct = icot_result.fit_seconds
score_ws_oct = icot_result.dunnindex
score_al_ws_oct = icot_result.silhouette;


[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:55[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:01:22[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:58[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:01:25[39m


In [9]:
# Manual sweep over the maximum tree depth: call `icot` once per depth and time each call.
# `icot(maxdepth)` must already be defined in the session when this cell runs.
# NOTE(review): the original note reads "unknown cannot use for gridsearch" -- presumably
# the built-in grid search could not be used for this sweep; confirm.
for depth in 4:10
    @time begin
        @printf("MAXDEPTH = %1i\n", depth)
        icot(depth)
    end
end

MAXDEPTH = 4


[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:45[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:01:05[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:42[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:01:00[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:46[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:01:04[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:00:45[39m
[32mTraining trees...100%|██████████████████████████████████| Time: 0:01:14[39m


InterruptException: [91mInterruptException:[39m