## DIstributed Shared Nearest Neighbor experiments

In [None]:
if length(workers()) > 1
    println("Removing previous workers...")
    rmprocs(workers())
end

nofworkers = 7
addprocs(nofworkers)

push!(LOAD_PATH, pwd())
@everywhere using Distances
@everywhere using StatsBase
@everywhere using Clustering
@everywhere using LightGraphs
@everywhere include("WorkerSNN.jl")
@everywhere include("MasterSNN.jl")
@everywhere include("IOSNN.jl")
@everywhere include("SNNDBSCAN.jl")
@everywhere include("SNNGraphUtil.jl")

using PyCall
@pyimport clustering_scores as cs #clustering_scores.py must be in the path.
using JLD

#DATA_PATH = "./toy_example.dat";
#LABEL_PATH = "./toy_example.dat.labels";

#DATA_PATH = "./blobs.dat";
#LABEL_PATH = "./blobs.dat.labels";

#DATA_PATH = "./TDT2/tdt2_tfidf_top30.csv";
#LABEL_PATH = "./TDT2/tdt2_tfidf_top30.csv.labels";

#DATA_PATH = "./RCV1/reuters_single_tfidf_top30.csv";
#LABEL_PATH = "./RCV1/reuters_single_tfidf_top30.csv.labels";

#DATA_PATH = "./20newsgroups/20ng_tfidf_cai.csv";
#LABEL_PATH = "./20newsgroups/20ng_tfidf_cai.csv.labels";

DATA_PATH = "./20newsgroups/20ng_tfidf_cai_top10.csv";
LABEL_PATH = "./20newsgroups/20ng_tfidf_cai_top10.csv.labels";


real_labels = vec(readdlm(LABEL_PATH, Int32));
N, dim = get_header_from_input_file(DATA_PATH);

#DATA = zeros(dim,N);
#get_cluto_data(DATA, DATA_PATH);
println("Dataset ",DATA_PATH," loaded (#Docs:",N,"/#Features:",dim,")");
pct_sample = 10; pct_sample = pct_sample/100; # (%) percentage of each local worker that will be sampled and transmitted to the Mas2ter

#global score statistics (along cut_point values)
summary_scores = Dict{String, Array{Tuple{Float64, Float64}, 1}}("elapsed"=>[], "bytesalloc" => [], "E"=>[], "P" => [], "ARI" => [], "AMI" => [], "NMI" => [], "H" => [], "C" => [], "VM" => [])
nruns = 10;# number of runs per cut_point value
cut_values = collect(5:5:40);
for cut_point=cut_values
    @printf "Starting runs with snn_cut_point:%d\n" cut_point 
    #score values attained along runs
    run_scores = Dict{String, Array{Float64, 1}}("elapsed"=>[], "bytesalloc" => [], "E"=>[], "P" => [], "ARI" => [], "AMI" => [], "NMI" => [], "H" => [], "C" => [], "VM" => [])

    for run_no=collect(1:nruns)
        partition = generate_partition(nofworkers, N); #N instances assigned to nofworkers cores.
        # Performs the clustering task
        results = Dict{String,Any}()        
        _, elapsed_t, bytes_alloc, _, _ = @timed master_work(results, DATA_PATH, partition, pct_sample, similarity="cosine", KNN=7, Eps_range=collect(5:5:40.0), MinPts_range=collect(5:5:40), k_range=[50], snn_cut_point=cut_point);

        push!(run_scores["elapsed"], elapsed_t)
        push!(run_scores["bytesalloc"], bytes_alloc)
        
        scores = cs.clustering_scores(real_labels, results["assignments"], false);
        scores = convert(Dict{String, Float64}, scores);
        for qm=keys(scores)
            push!(run_scores[qm], scores[qm])
        end    
    end
    #compute mean and std for each score (along runs)
    for qm=keys(run_scores)
        qm_mean = mean(run_scores[qm])
        qm_std = std(run_scores[qm])
        push!(summary_scores[qm], (qm_mean, qm_std))
    end 
end
# save summary_scores
jldoutput = join([DATA_PATH[1:end-4],"_summary.jld"]);#assumes that the data file ends with '.csv'
JLD.save(jldoutput, "summary_scores", summary_scores)
println("Storing summary to:", jldoutput)

In [27]:
function printable_run_score(dataset::String, cut_val::Int64, score_dict::Dict{Any, Any})
    vm = score_dict["VM"];
    ari = score_dict["ARI"];
    nmi = score_dict["NMI"];
    ami = score_dict["AMI"];
    p = score_dict["P"];
    e = score_dict["E"];
    
    output = ""
    #output = "|    dataset    | cut-point  |     VM     |     ARI    |     NMI    |     AMI    |      P     |      E     |\n|---------------|------------|------------|------------|------------|------------|------------|------------|\n"
    output = join([output,
        @sprintf "|  %s |  %d  | %.4f  |  %.4f  |  %.4f  |  %.4f  |  %.4f | %.4f  |" dataset cut_val vm ari nmi ami p e])
    return  output
end

#using PyCall
#@pyimport clustering_scores as cs #clustering_scores.py must be in the path.
#scores = cs.clustering_scores(real_labels, results["assignments"], false);
#println(printable_run_score("20NG", 5, scores))


function printable_run_score(dataset::String, cut_val::Int64, mean_std_dict::Dict{String, Tuple{Float64, Float64}})
    # Generates a single row with the output in Markdown
    measures = sort(collect(keys(mean_std_dict)))
    output = @sprintf "|  %s |  %d  |" dataset cut_val
    for qm=measures
        mean_v, std_v = mean_std_dict[qm];
        if qm == "bytesalloc"
            mean_v = mean_v / (1024*1024)
            std_v = std_v / (1024*1024)
        end
        current_cell = @sprintf "  %.4f(%.4f)  |" mean_v std_v;
        output = join([output,current_cell])
    end
    return output
end

function printable_run_score_summary(dataset::String, cut_values::Array{Int64,1}, score_dict::Dict{String, Array{Tuple{Float64, Float64}, 1}})
    # Generates the whole table in Markdown by calling the funciton above for each row
    measures = sort(collect(keys(score_dict)));
    output = "|    dataset    | cut-point  |"
    n_cols = 2 + length(measures)
    for qm=measures
        output = join([output, @sprintf "     %s   |" qm])
    end
    output = join([output, "\n|"])
    #VM     |     ARI    |     NMI    |     AMI    |      P     |      E     |
    for i=collect(1:n_cols)
        output = join([output, "---------------|"])
    end
    
    #one row per cut_value
    for cut_ix=collect(1:length(cut_values))
        mean_std_dict = Dict{String, Tuple{Float64, Float64}}()
        for qm=measures
            mean_std_dict[qm] = score_dict[qm][cut_ix]
        end
        output = join([output, "\n", printable_run_score(dataset, cut_values[cut_ix], mean_std_dict)])
    end
    return output
end


function printable_run_score_tex(dataset::String, cut_val::Int64, mean_std_dict::Dict{String, Tuple{Float64, Float64}}, usedcolumns::Array{String})
    # Generates a single row with the output in Markdown
    #measures = sort(collect(keys(mean_std_dict)))
    measures = usedcolumns;
    output = @sprintf "%s & %d " dataset cut_val
    for qm=measures
        mean_v, std_v = mean_std_dict[qm];
        if qm == "bytesalloc"
            mean_v = mean_v / (1024*1024)
            std_v = std_v / (1024*1024)
        end
        current_cell = @sprintf "& %.4f(%.4f)" mean_v std_v;
        output = join([output,current_cell])
    end
    output = join([output,"\\\\ \\hline"])
    return output
end

#=
\begin{center}
  \begin{tabular}{ | l | c | r }
    \hline
    1 & 2 & 3 \\ \hline
    4 & 5 & 6 \\ \hline
    7 & 8 & 9 \\
    \hline
  \end{tabular}
\end{center}
=#

#function printable_run_score_summary_tex(dataset::String, cut_values::Array{Int64,1}, score_dict::Dict{String, Array{Tuple{Float64, Float64}, 1}};usedcolumns::Array{String}=String[])
function printable_run_score_summary_tex(dataset::String, cut_values::Array{Int64,1}, score_dict::Dict{String, Any};usedcolumns::Array{String}=String[])
    # Generates the whole table in Markdown by calling the funciton above for each row
    measures = String[];
    if length(usedcolumns) > 0
        append!(measures, usedcolumns)
    else
        append!(measures, sort(collect(keys(score_dict))) );
    end 
    n_cols = 2 + length(measures)
    
    output = "\% Ensure that \\usepackage{pdflscape} is used.\n\n";
    output = join([output,"\\begin{landscape}\n\\begin{center}\n\\resizebox{\\columnwidth}{!}{%\n\\begin{tabular}{|"]);
    for i=collect(1:n_cols)
        output = join([output, "c|"])
    end
    output = join([output,"}\n\\hline\n"]);
    
    output = join([output,"\\textbf{dataset}&\\textbf{cutpoint}"])
    for qm=measures
        output = join([output, @sprintf "&\\textbf{%s}" qm])
    end    
    output = join([output, "\\\\ \\hline\n"])
    ##
    #=
    for i=collect(1:n_cols)
        output = join([output, "---------------|"])
    end
    =#

    #one row per cut_value
    for cut_ix=collect(1:length(cut_values))
        mean_std_dict = Dict{String, Tuple{Float64, Float64}}()
        for qm=measures
            mean_std_dict[qm] = score_dict[qm][cut_ix]
        end
        output = join([output, "\n", printable_run_score_tex(dataset, cut_values[cut_ix], mean_std_dict, measures)])
    end

    ##
    output = join([output, "\n\\end{tabular}}\n\\end{center}\n\\end{landscape}"])
    return output
end


using JLD
cut_values = collect(5:5:40);
includedcols = ["VM", "NMI", "AMI","ARI"];
#=
summary = JLD.load("./20newsgroups/20ng_tfidf_cai_top10_summary.jld")["summary_scores"]
println(printable_run_score_summary_tex(@sprintf("20Ng(top-10) (#w:%d)",summary["nworkers"]), summary["cut_range"], summary, usedcolumns=includedcols))

summary = JLD.load("./20newsgroups/20ng_tfidf_cai_summary.jld")["summary_scores"]
println(printable_run_score_summary_tex(@sprintf("20Ng(all) (#w:%d)",summary["nworkers"]), summary["cut_range"], summary, usedcolumns=includedcols))

summary = JLD.load("./RCV1/reuters_single_tfidf_top30_summary.jld")["summary_scores"]
println(printable_run_score_summary_tex(@sprintf("Reuters(top-30) (#w:%d)",summary["nworkers"]), summary["cut_range"], summary, usedcolumns=includedcols))

summary = JLD.load("./TDT2/tdt2_tfidf_top30_summary.jld")["summary_scores"]
println(printable_run_score_summary_tex(@sprintf("TDT2(top-30) (#w:%d)",summary["nworkers"]), summary["cut_range"], summary, usedcolumns=includedcols))
=#
summary = JLD.load("./KDD2004/phy_filtered_summary.jld")["summary_scores"]
println(printable_run_score_summary_tex(@sprintf("KDD04(train) (#w:%d)",summary["nworkers"]), summary["cut_range"], summary, usedcolumns=includedcols))

% Ensure that \usepackage{pdflscape} is used.

\begin{landscape}
\begin{center}
\resizebox{\columnwidth}{!}{%
\begin{tabular}{|c|c|c|c|c|c|}
\hline
\textbf{dataset}&\textbf{cutpoint}&\textbf{VM}&\textbf{NMI}&\textbf{AMI}&\textbf{ARI}\\ \hline

KDD04(train) (#w:35) & 50 & 0.0245(0.0011)& 0.0388(0.0017)& 0.0131(0.0006)& 0.0019(0.0004)\\ \hline
KDD04(train) (#w:35) & 55 & 0.0247(0.0010)& 0.0390(0.0016)& 0.0132(0.0006)& 0.0020(0.0003)\\ \hline
KDD04(train) (#w:35) & 60 & 0.0247(0.0010)& 0.0391(0.0015)& 0.0132(0.0005)& 0.0018(0.0003)\\ \hline
KDD04(train) (#w:35) & 65 & 0.0245(0.0008)& 0.0387(0.0013)& 0.0131(0.0005)& 0.0020(0.0003)\\ \hline
KDD04(train) (#w:35) & 70 & 0.0248(0.0008)& 0.0394(0.0014)& 0.0133(0.0005)& 0.0020(0.0004)\\ \hline
KDD04(train) (#w:35) & 75 & 0.0248(0.0007)& 0.0393(0.0012)& 0.0132(0.0004)& 0.0020(0.0003)\\ \hline
KDD04(train) (#w:35) & 80 & 0.0248(0.0008)& 0.0393(0.0012)& 0.0133(0.0004)& 0.0021(0.0003)\\ \hline
KDD04(train) (#w:35) & 85 & 0.0242(0.0012)& 0.0383(0.001



In [29]:
summary["epsilon_range"]

(5.0,10.0,Inf)

In [None]:
matshow(results["sampled_data_snn"])

In [None]:
# Creating a vector with pairwise snn-similarities 
L = Int64[]
for i=collect(1:size(results["sampled_data_snn"],1)-1)
    for j=collect((i+1):size(results["sampled_data_snn"],1))
        push!(L, results["sampled_data_snn"][i,j]);
    end
end

test_hist = fit(Histogram, L);
cpL = ecdf(L); # cumulative dist.

fig = figure("pyplot_subplot_column",figsize=(10,10));
subplot(211);
grid();
title("PDF of pairwise snn-similarities");
bar(0:length(test_hist.weights)-1, test_hist.weights);
xticks(0:length(test_hist.weights), test_hist.edges[1]);

subplot(212);
title("CDF of pairwise snn-similarities");
x = collect(1:test_hist.edges[1][end]);
y = cpL(x);
grid();
bar(x,y)

### Centralized experiments

In [None]:
include("centralized_experiment.jl")

In [None]:
DATA_PATH = "./TDT2/tdt2_tfidf_top30.csv";
LABEL_PATH = "./TDT2/tdt2_tfidf_top30.csv.labels";
println("TDT2 top30")
println(snn_grid_evaluation(DATA_PATH, LABEL_PATH))

println("RCV1 top30")
DATA_PATH = "./RCV1/reuters_single_tfidf_top30.csv";
LABEL_PATH = "./RCV1/reuters_single_tfidf_top30.csv.labels";
println(snn_grid_evaluation(DATA_PATH, LABEL_PATH))

println("20NG")
DATA_PATH = "./20newsgroups/20ng_tfidf_cai.csv";
LABEL_PATH = "./20newsgroups/20ng_tfidf_cai.csv.labels";
println(snn_grid_evaluation(DATA_PATH, LABEL_PATH))

println("20NG top10")
DATA_PATH = "./20newsgroups/20ng_tfidf_cai_top10.csv";
LABEL_PATH = "./20newsgroups/20ng_tfidf_cai_top10.csv.labels";
println(snn_grid_evaluation(DATA_PATH, LABEL_PATH))

__Centralized SNN Clustering results__
```
TDT2 top 30 (9394 x 36771) 
K       : 60
epsilon : 25.0
minpts  : 45
VM      : 0.769667
elapsed : 0.857351

RCV1 top30 (8067 x 18933)
K       : 80
epsilon : 35.0
minpts  : 65
VM      : 0.482285
elapsed : 0.967809

20NG (18846 x 26214)
K       : 30
epsilon : 15.0
minpts  : 15
VM      : 0.300971
elapsed : 3.92845

20NG top10 (9917 x 26214)
K       : 200
epsilon : 75.0
minpts  : 145
VM      : 0.448386
elapsed : 0.954286
```

# ZONA DE PUEBAS

In [None]:
num_points = size(DATA,2);
Snn = zeros(Float64, num_points, num_points);
S = zeros(Float64, num_points, num_points);
shared_nn_sim(DATA, 110, Snn, S, similarity="cosine");

In [None]:
d_point_cluster_id = Dict{Int64, Int64}();
cluster_assignment = fill(SNNDBSCAN.UNCLASSIFIED, num_points);
corepoints = Int64[];

for i=collect(1:num_points)
    d_point_cluster_id[i] = SNNDBSCAN.UNCLASSIFIED;
end

SNNDBSCAN.dbscan(num_points, 25.0, 30, Snn, d_point_cluster_id, corepoints)
for i=collect(1:num_points)
    cluster_assignment[i] = d_point_cluster_id[i]
end
using PyCall
@pyimport clustering_scores as cs #clustering_scores.py must be in the path.
scores = cs.clustering_scores(real_labels, cluster_assignment, false)

In [None]:
params = tuned_snn_clustering(DATA, Eps_range = collect(3.0:15.0:40.0), MinPts_range = collect(10:10:30), k_range = [40, 90], similarity="cosine")
#assigned = find(x-> x>0, cluster_assignment)
#mean(silhouettes(cluster_assignment[assigned], counts(cluster_assignment[assigned],maximum(cluster_assignment[assigned])), 110-Snn[assigned,assigned]))

### Kmeans results: Latex report

In [None]:
using PyCall
@pyimport pickle
includedcols = ["VM", "NMI", "AMI","P","E"];

f=open("./20newsgroups/20ng_tfidf_cai_top10_kmeans_summary.pkl","r")
km_summ = pickle.load(PyTextIO(f))
kmscores = convert(Dict{String, Array{Tuple{Float64, Float64}, 1}}, km_summ)
println(printable_run_score_summary_tex("20NG (top 10)", collect(2:2:30), kmscores, usedcolumns=includedcols))
close(f)

f=open("./20newsgroups/20ng_tfidf_cai_kmeans_summary.pkl","r")
km_summ = pickle.load(PyTextIO(f))
kmscores = convert(Dict{String, Array{Tuple{Float64, Float64}, 1}}, km_summ)
println(printable_run_score_summary_tex("20NG", collect(2:2:30), kmscores, usedcolumns=includedcols))
close(f)

f=open("./RCV1/reuters_single_tfidf_top30_kmeans_summary.pkl","r")
km_summ = pickle.load(PyTextIO(f))
kmscores = convert(Dict{String, Array{Tuple{Float64, Float64}, 1}}, km_summ)
println(printable_run_score_summary_tex("RCV1 (top 30)", collect(2:2:30), kmscores, usedcolumns=includedcols))
close(f)

f=open("./TDT2/tdt2_tfidf_top30_kmeans_summary.pkl","r")
km_summ = pickle.load(PyTextIO(f))
kmscores = convert(Dict{String, Array{Tuple{Float64, Float64}, 1}}, km_summ)
println(printable_run_score_summary_tex("TDT2 (top 30)", collect(2:2:30), kmscores, usedcolumns=includedcols))
close(f)

In [None]:
### Repeated Bisection results: Latex report

In [None]:
f=open("./20newsgroups/20ng_tfidf_cai_top10_kmeansRB_summary.pkl","r")
km_summ = pickle.load(PyTextIO(f))
kmscores = convert(Dict{String, Array{Tuple{Float64, Float64}, 1}}, km_summ)
println(printable_run_score_summary_tex("20NG (top 10)", collect(2:2:30), kmscores, usedcolumns=includedcols))
close(f)

f=open("./20newsgroups/20ng_tfidf_cai_kmeansRB_summary.pkl","r")
km_summ = pickle.load(PyTextIO(f))
kmscores = convert(Dict{String, Array{Tuple{Float64, Float64}, 1}}, km_summ)
println(printable_run_score_summary_tex("20NG", collect(2:2:30), kmscores, usedcolumns=includedcols))
close(f)

f=open("./RCV1/reuters_single_tfidf_top30_kmeansRB_summary.pkl","r")
km_summ = pickle.load(PyTextIO(f))
kmscores = convert(Dict{String, Array{Tuple{Float64, Float64}, 1}}, km_summ)
println(printable_run_score_summary_tex("RCV1 (top 30)", collect(2:2:30), kmscores, usedcolumns=includedcols))
close(f)

f=open("./TDT2/tdt2_tfidf_top30_kmeansRB_summary.pkl","r")
km_summ = pickle.load(PyTextIO(f))
kmscores = convert(Dict{String, Array{Tuple{Float64, Float64}, 1}}, km_summ)
println(printable_run_score_summary_tex("TDT2 (top 30)", collect(2:2:30), kmscores, usedcolumns=includedcols))
close(f)

In [None]:
-

In [None]:
println("Elapsed time:", elapsed_t)

In [None]:
results

In [None]:
_, elapsed_t, bytes_alloc, _, _ = @timed 7^1000000000;

## Experiments using the Cluto commands

Este código puede ser usado para reemplazar su par en Python.

In [None]:
#DATA_PATH = "./20newsgroups/20ng_tfidf_cai_top10.csv"
#LABEL_PATH = "./20newsgroups/20ng_tfidf_cai_top10.csv.labels"
DATA_PATH = "./KDD-CUP-99/corrected_numeric_sparse.csv"
LABEL_PATH = "./KDD-CUP-99/corrected_numeric.csv.labels"
real_labels = vec(readdlm(LABEL_PATH, Int32));

In [None]:
function exec_cluto_rb(vectors_file::String, nclusters::Int64; CLUTOV_CMD::String="./cluto-2.1.2/Linux-x86_64/vcluster"
)
    output = readstring(`$CLUTOV_CMD -clustfile=$vectors_file.k$nclusters $vectors_file $nclusters`);
    
    assign_fpath=@sprintf("%s.k%d", vectors_file,nclusters)
    
    f = open(assign_fpath);
    labels=Int64[];
    for ln in eachline(f)
        lbl_i=parse(Int64, ln);
        push!(labels, lbl_i);
    end
    close(f)
    return labels
end

In [None]:
cluster_assignment=exec_cluto_rb(DATA_PATH, 40);
using PyCall
@pyimport clustering_scores as cs #clustering_scores.py must be in the path.
scores = cs.clustering_scores(real_labels, cluster_assignment, false)

### Centralized SNN

In [1]:
include("IOSNN.jl");
include("WorkerSNN.jl");
include("SNNDBSCAN.jl");
using PyCall
@pyimport clustering_scores as cs #clustering_scores.py must be in the path.

In [2]:
#DATA_PATH = "./20newsgroups/20ng_tfidf_cai_top10.csv";
#LABEL_PATH = "./20newsgroups/20ng_tfidf_cai_top10.csv.labels";
DATA_PATH = "./cure_small.dat";
LABEL_PATH = "./cure_small.dat.labels";
####
real_labels = vec(readdlm(LABEL_PATH, Int64));
num_points, dim = get_header_from_input_file(DATA_PATH);
D = zeros(dim, num_points);
get_cluto_data(D, DATA_PATH);
####
S = zeros(Float64, num_points, num_points);
cosine_sim(D, S);
Snn = zeros(Float64, num_points, num_points);

In [3]:
####
shared_nn_sim(D, 150, Snn, S)

In [4]:
Eps, MinPts = 140, 10

@time begin
    cluster_assignment = SNNDBSCAN.snn_clustering(convert(Float64, Eps), MinPts, Snn)
end
scores = cs.clustering_scores(real_labels, cluster_assignment, false)

  0.098738 seconds (39.11 k allocations: 41.050 MB, 5.23% gc time)


Dict{Any,Any} with 8 entries:
  "AMI" => 0.557323
  "P"   => 0.793665
  "C"   => 0.560338
  "NMI" => 0.609712
  "E"   => 0.304178
  "VM"  => 0.607544
  "H"   => 0.663436
  "ARI" => 0.465132

#### ICVNN Measure

In [5]:
function cvnn_index(D::Array{Float64,2}, labels::Array{Int64,1}; sep_k::Int64=10)
    #computing separation
    sep_score = 0.0
    com_score = 0.0
    for c=unique(labels)
        points_in_c = find(x->x==c, labels)
        n_c = size(points_in_c)[1]

        sum_c = 0.0
        for j=points_in_c
            knn_j = sortperm(D[:,j])[2:(sep_k + 1)] #k-nst-n (ascending order in dist)
            q_j = size(find(x->x!=c, labels[knn_j]))[1] #nst-n in different group
            sum_c += q_j/sep_k
        end
        sep_c = (1.0/n_c)*sum_c #average weight for objs in the current cluster.
        if sep_c > sep_score
            sep_score = sep_c
        end
        ##
        sum_c = 0.0
        sims_c = D[points_in_c,points_in_c]
        for i=collect(1:(n_c-1))
            for j=collect((i+1):n_c)
                sum_c += D[i,j]
            end
        end
        com_score += (2.0/(n_c*(n_c-1)))*sum_c        
    end
    return (com_score + sep_score)
end



cvnn_index (generic function with 1 method)

In [6]:
cvnn_index(150-Snn, cluster_assignment)

1481.9679429665641

In [7]:
cvnn_index(150-Snn, real_labels)

933.2097563167848

In [8]:
cvnn_index(1-S, cluster_assignment)

10.940735332432107

In [9]:
cvnn_index(1-S, real_labels)

7.484706011523611

In [64]:
function silhouette(D::Array{Float64,2}, labels::Array{Int64,1})
    #D: Distance matrix
    #labels: vector with assignments
    #
    # The best value is 1 and the worst value is -1. 
    # Values near 0 indicate overlapping clusters. 
    # Negative values generally indicate that a sample has been assigned to the wrong cluster
    #
    n = size(D)[1]
    sil_sum = 0.0
    for i=collect(1:n)
        sil_sum += silhouette_i(i, D, labels)
    end
    return sil_sum / n
end
    
function silhouette_i(i::Int64, D::Array{Float64,2}, labels::Array{Int64,1})    
    #i: current point to examine
    #D: Distance matrix
    #labels: vector with assignments
    #
    # Compute the Silhouette Coefficient for a specific sample.
    #
    A = labels[i]
    points_in_A = find(x->x==A, labels)
    a_i = (sum(D[i, points_in_A])-D[i,i])/(size(points_in_A)[1] - 1)#It is assumed that D[i,i]:=0
    b_i = Inf
    for c=unique(labels) #computing min ave.dist among i and items in other clusters.
        if c == A
            continue
        end
        points_in_c = find(x->x==c, labels)
        ave_dist_i = sum(D[i, points_in_c])/size(points_in_c)[1]
        if ave_dist_i < b_i
            b_i = ave_dist_i
        end
    end
    return (b_i - a_i)/(max(b_i, a_i))
end



silhouette_i (generic function with 1 method)

In [68]:
println(silhouette(150-Snn, cluster_assignment))

0.32684432851919676


In [39]:
using Clustering
assigned = find(x-> x>0, cluster_assignment);
mean(Clustering.silhouettes(cluster_assignment[assigned], Clustering.counts(cluster_assignment[assigned],maximum(cluster_assignment[assigned])), 150-Snn[assigned,assigned]))

0.3689959974079539

In [58]:
function label_encoding(labels::Array{Int64})
    new_label_map = Dict{Int64, Int64}();
    p = 0;
    for c=unique(labels)
        p += 1;
        new_label_map[c] = p;
    end

    new_labels = map((x) -> new_label_map[x], labels);
    return new_labels
end

label_encoding (generic function with 1 method)

In [56]:
mean(Clustering.silhouettes(new_labels, Clustering.counts(new_labels,maximum(new_labels)), 150-Snn))

0.3268443285191965

In [2]:
collect(1:5:20)

4-element Array{Int64,1}:
  1
  6
 11
 16