In [28]:
using PyPlot, DataFrames, Combinatorics

In [3]:
include("../src/SparseKMeans.jl")
include("../src/RobustKMeans.jl")
include("../src/DataGeneration.jl");

Academic license - for non-commercial use only


In [71]:
# Experiment settings. `n`, `p` and `K` are forwarded unchanged to the data
# generator and to both clustering routines (presumably sample count, number of
# informative features and number of clusters -- confirm against
# ../src/DataGeneration.jl).
n, p, K = 100, 50, 3
σ_noise = 1.0
# Not used by this sweep; `p` is held fixed instead of being derived from `d`.
true_features_pct = 0.01
SNR_grid = [1.1, 1.5, 2.0, 5.0]
d_grid = [100, 500, 1000, 5000, 10000]

# One record (a Dict keyed by Symbol) is appended per (SNR, d) combination.
sparse_kmeans_results = []
robust_kmeans_results = []

# The left-most iterator is the outer loop, so every `d` is visited for a given
# `SNR` before moving on to the next `SNR`.
for SNR in SNR_grid, d in d_grid
    # Group spread is expressed as a multiple of the noise level.
    σ_group = SNR * σ_noise
    X, true_assignments, true_features = generate_random_data(
        n, d, p, K; σ_noise=σ_noise, σ_group=σ_group
    )
    selected_features, sparse_cluster_assignments = sparse_kmeans(X, K, p)
    feature_weights, robust_cluster_assignments = robust_kmeans(X, K; λ=float(p))

    # Settings and ground truth that are stored identically for both methods.
    run_info = Dict(
        :n => n,
        :d => d,
        :p => p,
        :K => K,
        :SNR => SNR,
        :true_features => true_features,
        :true_assignments => true_assignments
    )
    sparse_record = merge(
        run_info,
        Dict(
            :selected_features => selected_features,
            :cluster_assignments => sparse_cluster_assignments
        )
    )
    robust_record = merge(
        run_info,
        Dict(
            :feature_weights => feature_weights,
            :cluster_assignments => robust_cluster_assignments
        )
    )
    push!(sparse_kmeans_results, sparse_record)
    push!(robust_kmeans_results, robust_record)
    println("Completed iteration with d=$d and SNR=$SNR.")
end

Completed iteration with d=100 and SNR=1.1.
Completed iteration with d=500 and SNR=1.1.
Completed iteration with d=1000 and SNR=1.1.
Completed iteration with d=5000 and SNR=1.1.
Completed iteration with d=10000 and SNR=1.1.
Completed iteration with d=100 and SNR=1.5.
Completed iteration with d=500 and SNR=1.5.
Completed iteration with d=1000 and SNR=1.5.
Completed iteration with d=5000 and SNR=1.5.
Completed iteration with d=10000 and SNR=1.5.
Completed iteration with d=100 and SNR=2.0.
Completed iteration with d=500 and SNR=2.0.
Completed iteration with d=1000 and SNR=2.0.
Completed iteration with d=5000 and SNR=2.0.
Completed iteration with d=10000 and SNR=2.0.
Completed iteration with d=100 and SNR=5.0.
Completed iteration with d=500 and SNR=5.0.
Completed iteration with d=1000 and SNR=5.0.
Completed iteration with d=5000 and SNR=5.0.
Completed iteration with d=10000 and SNR=5.0.


In [67]:
"""
    get_selected_features_from_weights(feature_weights, p)

Return the indices of the `p` largest entries of `feature_weights`, ordered from
the largest weight to the smallest. Entries with equal weight are returned in
increasing index order.

Every index appears at most once, including when the weights contain zeros or
negative values.

# Arguments
- `feature_weights::Array{Float64, 1}`: one weight per feature.
- `p`: number of indices to return; a value below 1 yields an empty vector.

# Returns
A `Vector{Int}` with `p` distinct indices into `feature_weights`.

# Throws
- `ArgumentError` if `p` exceeds the number of weights.
"""
function get_selected_features_from_weights(feature_weights::Array{Float64, 1}, p)
    d = size(feature_weights, 1)
    # `1:p` is empty whenever p < 1, so such requests select nothing.
    n_selected = length(1:p)
    if n_selected > d
        throw(ArgumentError(
            "cannot select $n_selected features from only $d feature weights"
        ))
    end
    # A stable descending sort ranks each feature exactly once, so no index can
    # be chosen twice regardless of the sign of the weights.
    ranking = sortperm(feature_weights; rev=true)
    return ranking[1:n_selected]
end

get_selected_features_from_weights (generic function with 1 method)

In [68]:
"""
    calculate_feature_tpr(true_features, selected_features, p) -> Float64

Count how many entries of `selected_features` also occur in `true_features` and
divide that count by `p`.

Repeated entries in `selected_features` are counted once per occurrence.
"""
function calculate_feature_tpr(
    true_features::Array{Int64, 1},
    selected_features::Array{Int64, 1},
    p::Int64
)::Float64
    hits = count(feature -> feature in true_features, selected_features)
    return hits / p
end

calculate_feature_tpr (generic function with 1 method)

In [69]:
"""
    calculate_assignment_accuracy(true_assignments, cluster_assignments, K) -> Float64

Return the largest fraction of points whose predicted label matches the true
label under any relabelling of the predicted clusters.

Every permutation of `1:K` is tried as a mapping from true label `k` to
predicted label `perm[k]`; the permutation with the most agreeing points
determines the score. Labels outside `1:K` never contribute a match.
"""
function calculate_assignment_accuracy(
    true_assignments::Array{Int64, 1},
    cluster_assignments::Array{Int64, 1},
    K::Int64
)::Float64
    n_points = size(true_assignments, 1)
    most_matches = 0
    for relabeling in permutations(1:K)
        matches = 0
        for k in 1:K
            # Points whose true label is k and whose predicted label is the one
            # this relabeling pairs with k.
            agree = (true_assignments .== k) .& (cluster_assignments .== relabeling[k])
            matches += count(agree)
        end
        most_matches = max(most_matches, matches)
    end
    return most_matches / n_points
end

calculate_assignment_accuracy (generic function with 3 methods)

In [73]:
# Summarise every run of the sweep: the feature-recovery rate and the
# clustering accuracy for each method, in the order the runs were recorded.
#
# NOTE: the `*_accruacy` spelling is kept on purpose so that any later cell
# referring to these variables keeps working.

# Sparse k-means reports its selected features directly.
sparse_tpr = [
    calculate_feature_tpr(result[:true_features], result[:selected_features], result[:p])
    for result in sparse_kmeans_results
]

# Robust k-means reports feature weights, so the top `p` weights are converted
# into a feature selection first. The per-run `result[:p]` is used (not the
# global `p`) so the metric stays correct if `p` is changed after the sweep.
robust_tpr = [
    calculate_feature_tpr(
        result[:true_features],
        get_selected_features_from_weights(result[:feature_weights], result[:p]),
        result[:p]
    )
    for result in robust_kmeans_results
]

sparse_accruacy = [
    calculate_assignment_accuracy(result[:true_assignments], result[:cluster_assignments], result[:K])
    for result in sparse_kmeans_results
]

# Must iterate over the robust results; reading the sparse results here would
# simply duplicate `sparse_accruacy`.
robust_accruacy = [
    calculate_assignment_accuracy(result[:true_assignments], result[:cluster_assignments], result[:K])
    for result in robust_kmeans_results
];