In [2]:
using DataFrames, MLDataUtils
using Clustering, Distances
using CSV
using Random
using Logging

# Configure logging: send messages to stderr and drop everything below the Warn level,
# so that informational package messages do not clutter the model training output.
logger = SimpleLogger(stderr, Logging.Warn)
Logging.global_logger(logger);

In [3]:
#### Set parameters for the learners

# Experiment bookkeeping
method = "ICOT_local"
gridsearch = false
seed = 1                  # used for Random.seed! and forwarded as `ls_random_seed`

# Clustering criterion, forwarded as both `criterion` and `ls_warmstart_criterion`
cr = :silhouette

# Warm-start and geometric-search settings
warm_start = :none        # forwarded as `kmeans_warmstart`; reassigned before each ICOT run
geom_search = false       # forwarded as `geom_search`
threshold = 0.0           # forwarded as `geom_threshold`

# Tree-size and search-effort settings
num_tree_restarts = 100   # forwarded as `ls_num_tree_restarts`
complexity_c = 0.0        # forwarded as `cp`
min_bucket = 10           # forwarded as `minbucket`
maxdepth = 5              # forwarded as `max_depth`

###### Step 1: Prepare the data
# Read the data. The (deprecated) readtable() command was recommended here only to avoid potential
# version conflicts with the CSV package; the next cell loads the data with CSV.File instead.
# data = readtable("../data/ruspini.csv"); 


In [None]:
# Location of the input CSV. Set the ICOT_DATA_PATH environment variable to run the notebook
# on another machine; the default is the path that was originally hard-coded here.
data_path = get(ENV, "ICOT_DATA_PATH",
    "/home/sfy/Documents/VScodeProject/Thesis/algorithms/alldata.csv")
# Fail early with a clear message rather than with an error raised inside the CSV reader.
isfile(data_path) || throw(ArgumentError("data file not found: $data_path"))
# Load the whole CSV file into a DataFrame.
data = DataFrame(CSV.File(data_path))

In [5]:
# Dimensions of the loaded data as a (rows, columns) tuple
(nrow(data), ncol(data))

(41257, 697)

In [6]:


# Convert the full dataset (every column, including the label column :y) to a matrix
data_array = Matrix(data)
# Number of rows and columns of the full matrix
n, p = size(data_array)
# Transposed full matrix; kept for code that still refers to it
data_t = data_array';

##### Step 2: Fit K-means clustering on the dataset to generate a warm-start for ICOT
# Cluster on the feature columns only: the label column :y is removed from the features
# before the trees are fit, so it must not drive the k-means assignments either.
# The matrix is transposed because kmeans treats each column as one observation.
features_t = Matrix(select(data, Not(:y)))';

# Fix the seed so that the k-means initialisation is reproducible
Random.seed!(seed);
K = 2

# Run k-means and save the assignments
kmeans_result = kmeans(features_t, K);
assignment = kmeans_result.assignments;


In [7]:
# Display the loaded DataFrame
data

Unnamed: 0_level_0,android.permission.CAMERA,android.permission.READ_CONTACTS,android.permission.WRITE_CONTACTS
Unnamed: 0_level_1,Int64,Int64,Int64
1,1,0,0
2,1,0,0
3,0,0,0
4,0,0,0
5,1,0,0
6,0,0,0
7,0,0,0
8,0,0,0
9,1,0,0
10,0,0,0


In [21]:

# Append the k-means assignments to the data as one extra column.
# Both arguments of hcat are data frames: concatenating a data frame with a bare vector is
# deprecated/unsupported in newer DataFrames.jl releases. The new column keeps the
# auto-generated name it received before (:x1); makeunique=true renames it if the data
# already contains a column with that name.
data_full = hcat(data, DataFrame(x1=assignment), makeunique=true);

# Prepare data for ICOT: the features are stored in X, and the labels taken from
# column :y (used for the supervised OCT fit and the OCT warm-start) are stored in y
X = select(data, Not(:y))
y = data[:, :y]

41257-element Array{Int64,1}:
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 ⋮
 1
 1
 1
 1
 1
 1
 1
 1
 1

In [11]:
##### Step 3a. Before running ICOT, check that the IAI license works
# Build a supervised Optimal Classification Tree learner (local search switched off),
# wrap it in a grid search and fit it on the features X and labels y.
lnr_oct = ICOT.IAI.OptimalTreeClassifier(;
    localsearch=false,
    max_depth=maxdepth,
    minbucket=min_bucket,
    criterion=:misclassification,
)
grid = ICOT.IAI.GridSearch(lnr_oct)
ICOT.IAI.fit!(grid, X, y)

GridSearch - Unfitted OptimalTreeClassifier:
  max_depth:   5
  minbucket:   10
  localsearch: false

GridSearch Params:
  ()

In [22]:
# Fit the grid search on X and y again and display the fitted result
ICOT.IAI.fit!(grid, X, y)

└ @ IAIBase /home/iai/.julia/packages/IAIBase/pOrUV/src/precompile.jl:19


All Grid Results:

│ Row │ cp          │ train_score │ valid_score │ rank_valid_score │
│     │ [90mFloat64[39m     │ [90mFloat64[39m     │ [90mFloat64[39m     │ [90mInt64[39m            │
├─────┼─────────────┼─────────────┼─────────────┼──────────────────┤
│ 1   │ 0.000104544 │ 0.881337    │ 0.884706    │ 1                │

Best Params:
  cp => 0.00010454418734318072

Best Model - Fitted OptimalTreeClassifier:
  1) Split: android.permission.READ_PHONE_STATE < 0.5
    2) Split: android.intent.category.HOME < 0.5
      3) Split: android.permission.SEND_SMS < 0.5
        4) Split: android/provider/Settings$Secure;setLocationProviderEnabled < 0.5
          5) Split: android/net/wifi/WifiManager;getConfiguredNetworks < 3.5
            6) Predict: 0 (96.89%), [11686,375], 12061 points, error 375
            7) Predict: 1 (61.54%), [5,8], 13 points, error 5
          8) Predict: 1 (97.78%), [1,44], 45 points, error 1
        9) Split: android/net/ConnectivityManager;getActiveNetwork

In [23]:
# Show the learner held by the grid search in the browser (the call returns the path of
# the generated HTML file).
# NOTE(review): the recorded output contains a warning for this call; presumably the direct
# field access `grid.lnr` is deprecated - confirm against the IAI docs and use the
# recommended accessor if so.
ICOT.IAI.showinbrowser(grid.lnr)

│   caller = top-level scope at none:0
└ @ Core none:0


"/tmp/tmpoEajSx/tree.html"

In [24]:

##### Step 3b. Run ICOT

# ICOT without any warm-start
warm_start = :none
lnr_ws_none = ICOT.InterpretableCluster(;
    ls_num_tree_restarts=num_tree_restarts,
    ls_random_seed=seed,
    cp=complexity_c,
    max_depth=maxdepth,
    minbucket=min_bucket,
    criterion=cr,
    ls_warmstart_criterion=cr,
    kmeans_warmstart=warm_start,
    geom_search=geom_search,
    geom_threshold=threshold,
);
# Fit the learner and keep the elapsed time (in seconds)
run_time_icot_ls_none = @elapsed begin
    ICOT.fit!(lnr_ws_none, X, y)
end;

# Show the fitted clustering tree in the browser
ICOT.showinbrowser(lnr_ws_none)
# Observed: no results after 30 mins


In [None]:
# Score the no-warm-start learner with both criteria (Dunn index first, then silhouette)
score_ws_none, score_al_ws_none = map((:dunnindex, :silhouette)) do crit
    ICOT.score(lnr_ws_none, X, y, criterion=crit)
end;

In [None]:

# ICOT with an OCT warm-start: an OCT is fit as a supervised learning problem on the
# labels "y", and that tree is used as the warm-start
warm_start = :oct
lnr_ws_oct = ICOT.InterpretableCluster(;
    ls_num_tree_restarts=num_tree_restarts,
    ls_random_seed=seed,
    cp=complexity_c,
    max_depth=maxdepth,
    minbucket=min_bucket,
    criterion=cr,
    ls_warmstart_criterion=cr,
    kmeans_warmstart=warm_start,
    geom_search=geom_search,
    geom_threshold=threshold,
);
# Fit the learner and keep the elapsed time (in seconds)
run_time_icot_ls_oct = @elapsed begin
    ICOT.fit!(lnr_ws_oct, X, y)
end;

# Score the fitted learner with the Dunn index and the silhouette criterion
score_ws_oct = ICOT.score(lnr_ws_oct, X, y; criterion=:dunnindex);
score_al_ws_oct = ICOT.score(lnr_ws_oct, X, y; criterion=:silhouette);


In [None]:

# Recompute both scores for the OCT-warm-start learner (Dunn index first, then silhouette);
# these are the same calls as at the end of the previous cell
score_ws_oct, score_al_ws_oct = map((:dunnindex, :silhouette)) do crit
    ICOT.score(lnr_ws_oct, X, y, criterion=crit)
end;
