# Combined analysis 
Analysis of the combination of all persistent homology (PH) and Dowker PH features.

In [None]:

include("../src/ECM_TDA.jl")
using .ECM_TDA

using Eirene
using Ripserer
using CSV
using TiffImages
using Images
using NPZ
using Plots
using PersistenceDiagrams
using Measures
using Distributions
using MultivariateStats
using LinearAlgebra
using Random
using StatsBase
using JLD2
using FileIO
using PersistenceDiagrams
using DelimitedFiles
using Distances
using UMAP
using Clustering


In [None]:
# Hex color strings named after ECM, cancer cells and leukocytes.
c_ECM = "#249EA0" # teal
c_cancer = "#592693" # purple
c_leukocytes = "#FAAB36" # orange

# General-purpose palette of hex color strings (c1 ... c8).
# NOTE(review): none of these are referenced in the visible part of this notebook;
# presumably they are kept for cluster coloring — confirm before removing.

c1 = "#fd5184" # pink
c2 = "#ffb602" # orange
c3 = "#3ec0c9" # blue / teal
c4 = "#d753ce" # purple
c5 = "#49a849"  # green
c6 = "#F28522" # dark orange
c7 = "#265BF5" # blue
c8 = "#AEF359" # lime green



## Create a combined feature from all PH and Dowker PH features

In [None]:
### load all persistence diagrams
# Every PD.jld2 file is read from disk once; the individual diagram dictionaries are
# then picked out of the loaded container by key.

# load ECM features
data = load("data/4000x4000_combined/ECM_PD/PD.jld2")
ECM_PD0 = data["PD0_ECM"]
ECM_PD1 = data["PD1_ECM"]

# load cancer & leukocyte features
PD = load("data/4000x4000_combined/cells_PD/PD.jld2")
cancer_PD0 = PD["PD0_cancer"]
cancer_PD1 = PD["PD1_cancer"]
leukocytes_PD0 = PD["PD0_leukocytes"]
leukocytes_PD1 = PD["PD1_leukocytes"]

# load Dowker: (cancer, leukocytes) features
data = load("data/4000x4000_combined/Dowker/cancer_leukocytes/PD.jld2")
CL_PD0 = data["PD0"]
CL_PD1 = data["PD1"];

# load Dowker: (ECM, cancer) features
data = load("data/4000x4000_combined/Dowker/ECM_cancer/PD.jld2")
ECM_C_PD0 = data["PD0"]
ECM_C_PD1 = data["PD1"];

# load Dowker: (ECM, leukocytes) features
data = load("data/4000x4000_combined/Dowker/ECM_leukocytes/PD.jld2")
ECM_L_PD0 = data["PD0"]
ECM_L_PD1 = data["PD1"];

# load all cell analysis
data = load("data/4000x4000_combined/all_cells_PD/PD.jld2")
all_cells_PD0 = data["PD0"]
all_cells_PD1 = data["PD1"];

In [None]:
# define new get_PD

# An entry holds no diagram when it is `nothing` or the 1×1 matrix [0.0].
_is_placeholder_pd(pd) = isnothing(pd) || pd == reshape([0.0], 1, 1)

# Largest entry over all usable diagrams in `PD_dict`, where `trim` selects the part of
# each diagram to look at. Diagrams that are placeholders, or empty after trimming,
# are skipped.
function _max_over_diagrams(trim, PD_dict)
    best = nothing
    for pd in values(PD_dict)
        _is_placeholder_pd(pd) && continue
        kept = trim(pd)
        isempty(kept) && continue
        largest = maximum(kept)
        best = isnothing(best) ? largest : max(best, largest)
    end
    isnothing(best) &&
        throw(ArgumentError("PD_dict contains no non-empty persistence diagram"))
    return best
end

"""
    get_PD0_max2(PD_dict)

Return the largest entry found in the diagrams stored in `PD_dict`, ignoring the last
row of every diagram (presumably the essential/infinite class of dimension 0 — TODO
confirm against the code that writes these diagrams).

Entries equal to `nothing` or to the 1×1 matrix `[0.0]` are skipped, as are diagrams
that have no rows left once the last row is dropped.

# Throws
- `ArgumentError` if no diagram contributes a value.
"""
get_PD0_max2(PD_dict) = _max_over_diagrams(pd -> pd[1:(end - 1), :], PD_dict)

"""
    get_PD1_max3(PD_dict)

Return the largest entry found in the diagrams stored in `PD_dict`.

Entries equal to `nothing` or to the 1×1 matrix `[0.0]` are skipped, as are empty
diagrams.

# Throws
- `ArgumentError` if no diagram contributes a value.
"""
get_PD1_max3(PD_dict) = _max_over_diagrams(identity, PD_dict)

In [None]:
# Print, one per line, the largest value found in each family of dimension-1 diagrams
# (ECM, leukocytes, cancer, the three Dowker pairings, all cells — in that order).
for diagrams in (
    ECM_PD1, leukocytes_PD1, cancer_PD1, CL_PD1, ECM_C_PD1, ECM_L_PD1, all_cells_PD1
)
    println(get_PD1_max3(diagrams))
end

In [None]:
# compute PIs
# Each line runs `compute_PI2` on the dimension-0 and then the dimension-1 diagram
# dictionary of one feature family.
ECM_PI0, ECM_PI1 = map(compute_PI2, (ECM_PD0, ECM_PD1))

cancer_PI0, cancer_PI1 = map(compute_PI2, (cancer_PD0, cancer_PD1))

leukocytes_PI0, leukocytes_PI1 = map(compute_PI2, (leukocytes_PD0, leukocytes_PD1))

CL_PI0, CL_PI1 = map(compute_PI2, (CL_PD0, CL_PD1))

ECM_C_PI0, ECM_C_PI1 = map(compute_PI2, (ECM_C_PD0, ECM_C_PD1))

ECM_L_PI0, ECM_L_PI1 = map(compute_PI2, (ECM_L_PD0, ECM_L_PD1))

all_cells_PI0, all_cells_PI1 = map(compute_PI2, (all_cells_PD0, all_cells_PD1));

In [None]:
# select dictionaries
# Listed in the order in which their per-ROI features are concatenated below.
dicts = [
    ECM_PI0,
    cancer_PI0,
    leukocytes_PI0,
    ECM_PI1,
    cancer_PI1,
    leukocytes_PI1,
    CL_PI0,
    CL_PI1,
    ECM_C_PI0,
    ECM_C_PI1,
    ECM_L_PI0,
    ECM_L_PI1,
    all_cells_PI0,
    all_cells_PI1,
];

# get keys that are present in all dictionaries
# (iterates the keys of the first dictionary, so their order is kept)
all_keys = [k for k in keys(first(dicts)) if all(d -> haskey(d, k), dicts)]

# combine all features
# For every shared key, flatten each dictionary's entry with `vec` and stack the
# pieces into one feature vector, following the order of `dicts`.
features = Dict(f => reduce(vcat, [vec(d[f]) for d in dicts]) for f in all_keys);

In [None]:
# Number the feature keys: `idx_ROI` maps a 1-based position in `ROIs` to its key.
ROIs = collect(keys(features))
idx_ROI = Dict(position => name for (position, name) in pairs(ROIs));

In [None]:
#save("analysis/combined/idx_ROI.jld2", "idx_ROI", idx_ROI)
# Read the stored index => ROI mapping and build its inverse (ROI => index).
idx_files = load("analysis/combined/idx_ROI.jld2")["idx_ROI"];
file_idx = Dict(roi => idx for (idx, roi) in pairs(idx_files));

In [None]:
# prepare features array
# One column per entry of `idx_files`, in index order. The column count is taken from
# `idx_files` itself because that is the mapping being indexed here.
n = length(idx_files)
features_array = reduce(hcat, [features[idx_files[i]] for i in 1:n])
println("features array shape: ", size(features_array))

# subtract from every row (feature) its mean over all columns
features_centered = features_array .- mean(features_array, dims = 2);

In [None]:
#save("analysis/combined/features.jld2", "features", features_array)

## UMAP on combined feature

In [None]:
# Stored index => ROI mapping and its inverse.
idx_files = load("analysis/combined/idx_ROI.jld2")["idx_ROI"];
file_idx = Dict(roi => idx for (idx, roi) in pairs(idx_files));

# Saved feature matrix, then the same matrix with each row's mean removed.
features_array = load("analysis/combined/features.jld2")["features"]
features_centered = features_array .- mean(features_array; dims = 2);

In [None]:
# compute UMAP & save
#embedding = umap(features_centered, 2; n_neighbors = 5);
#writedlm("analysis/combined/umap.csv", embedding, ",")

In [None]:

# Load the saved UMAP embedding. The file is comma-delimited text without a header
# (see the commented-out `writedlm` call that produces it), so it is read back with the
# matching `readdlm`; `CSV.read` without a sink argument is rejected by current CSV.jl.
embedding = readdlm("analysis/combined/umap.csv", ',', Float64)

## hierarchical clustering

In [None]:
"""
    print_cluster_sizes(hc_clusters)

Print one line per distinct label in `hc_clusters`, in order of first appearance,
giving the number of entries that carry that label.
"""
function print_cluster_sizes(hc_clusters)
    for label in unique(hc_clusters)
        members = count(==(label), hc_clusters)
        println("size of cluster ", label, ": ", members)
    end
end

In [None]:
# Embed the centered features in 2-D with UMAP, then run hierarchical clustering on
# the pairwise Euclidean distances between the embedded points.

# UMAP is stochastic; seed the global RNG (same seed as the other UMAP cells of this
# notebook) so that the embedding and the clusters derived from it are reproducible.
Random.seed!(10)
dimred_embedding = umap(features_centered, 2; n_neighbors = 5, min_dist = 0.00001);
println("size of reduced dimension embedding: ", size(dimred_embedding))

# dims = 2: distances are taken between columns of `dimred_embedding`
d = Distances.pairwise(Euclidean(), dimred_embedding, dims = 2)
println("distance matrix shape: ", size(d))

# hierarchical clustering with `hclust`'s default settings
hc = hclust(d);

In [None]:
# Plot the hierarchical clustering result `hc`.
plot(hc)

In [None]:
# Cut the hierarchy into k = 9 clusters and print how many entries fall in each.
h_clusters = cutree(hc, k = 9);
print_cluster_sizes(h_clusters)

In [None]:
# For every cluster label, collect the positions in `h_clusters` carrying that label.
cluster_indices = Dict(
    label => findall(==(label), h_clusters) for label in unique(h_clusters)
);

In [None]:
# Use the GR backend of Plots for this figure.
gr()
# NOTE: this replaces any `cluster_indices` computed earlier in the session with the
# copy stored on disk.
cluster_indices = load("analysis/combined/cluster_indices.jld2")["cluster_indices"]
# `plot_dim_red2` is not defined in this notebook (presumably it comes from ECM_TDA and
# draws the embedding colored by cluster — confirm against its definition).
p = plot_dim_red2(embedding, cluster_indices; 
                    dim_red = "UMAP", 
                    xaxis = "UMAP-1", 
                    yaxis = "UMAP-2",
                    xlims = (-9, 18)
                    )
#savefig("analysis/combined/umap.svg")

In [None]:
# reorder clusters
# c_reordered = Dict(1 => cluster_indices[4],
#                      2 => cluster_indices[9],
#                      3 => cluster_indices[2],
#                      4 => cluster_indices[7],
#                      5 => cluster_indices[8],
#                      6 => cluster_indices[3],
#                      7 => cluster_indices[5],
#                      8 => cluster_indices[1],
#                       9 => cluster_indices[6])
# cluster_indices = c_reordered;

In [None]:
# Switch Plots to the Plotly backend and scatter the two rows of `embedding`
# against each other, one marker per column.
plotly()
n = size(embedding, 2)
p = scatter(embedding[1,:], embedding[2,:], 
        markercolor = "slategrey",
        markersize = 5, 
        label = "", 
        xticks = [], 
        yticks = [], 
        framestyle = :box,  
        xlabel = "UMAP-1",
        ylabel = "UMAP-2",
        guidefontsize = 15,
        leftmargin = 5mm,
        size = (450, 350),
        # hover text is the column index of each point (presumably used to pick the
        # indices listed in `examples` — confirm)
        hover = 1:n,
        legend = :topright)

In [None]:
# Hand-picked example indices, four per cluster label (1-9).
# NOTE(review): cluster 3 lists index 30 twice — confirm whether a different fourth
# example was intended.
examples = Dict(
    1 => [55, 142, 396, 276],
    2 => [65, 313, 287, 122],
    3 => [30, 334, 80, 30],
    4 => [73, 333, 388, 228],
    5 => [245, 50, 243, 395],
    6 => [116, 26, 148, 129],
    7 => [286, 183, 157, 44],
    8 => [47, 259, 392, 62],
    9 => [70, 68, 244, 193],
)

In [None]:
# Render the hand-picked examples to a high-resolution PNG with the GR backend.
gr()
figname = "analysis/combined/hierarchical_clusters_representatives_selected_highres.png"
size_unit = 800
# Deliberately not called `size`: a top-level variable of that name clashes with
# `Base.size`, which other cells of this notebook call as a function.
fig_size = (4 * size_unit, 10 * size_unit)
plot_PSRH(examples, idx_files, figname; size = fig_size, bottom_margin = 50mm)

Randomly select four examples from each cluster and save the selection.

In [None]:
# Draw four distinct members at random from every cluster, store the draw on disk,
# and plot it.
cluster_4 = Dict(
    label => sample(members, 4; replace = false) for (label, members) in cluster_indices
)

save("analysis/combined/cluster_examples_2.jld2", "cluster_examples", cluster_4)

figname = "analysis/combined/hierarchical_clusters_representatives_random_2.png"
plot_PSRH(cluster_4, idx_files, figname)

In [None]:
# save 
#save("analysis/combined/cluster_indices.jld2", "cluster_indices", cluster_indices)

## UMAP from dimension-0 and dimension-1 features separately

### dimension 0

In [None]:
# Run `compute_PI2` on every dimension-0 diagram dictionary, in the listed order.
ECM_PI0, cancer_PI0, leukocytes_PI0, CL_PI0, ECM_C_PI0, ECM_L_PI0, all_cells_PI0 = map(
    compute_PI2,
    (ECM_PD0, cancer_PD0, leukocytes_PD0, CL_PD0, ECM_C_PD0, ECM_L_PD0, all_cells_PD0),
);


In [None]:
# select dictionaries
# Listed in the order in which their per-ROI features are concatenated below.
dicts = [
    ECM_PI0,
    cancer_PI0,
    leukocytes_PI0,
    CL_PI0,
    ECM_C_PI0,
    ECM_L_PI0,
    all_cells_PI0,
];

# get keys that are present in all dictionaries
# (iterates the keys of the first dictionary, so their order is kept)
all_keys = [k for k in keys(first(dicts)) if all(d -> haskey(d, k), dicts)]

# combine all features
# For every shared key, flatten each dictionary's entry with `vec` and stack the
# pieces into one feature vector, following the order of `dicts`.
features = Dict(f => reduce(vcat, [vec(d[f]) for d in dicts]) for f in all_keys);

In [None]:
# Stored index => ROI mapping and its inverse (ROI => index).
idx_files = load("analysis/combined/idx_ROI.jld2")["idx_ROI"];
file_idx = Dict(roi => idx for (idx, roi) in pairs(idx_files));

In [None]:
# prepare features array
# One column per entry of `idx_files`, in index order. `reduce(hcat, ...)` builds the
# matrix without splatting one argument per column into `hcat`.
n = length(idx_files)
features_array = reduce(hcat, [features[idx_files[i]] for i in 1:n])
println("features array shape: ", size(features_array))

# subtract from every row (feature) its mean over all columns
features_centered = features_array .- mean(features_array, dims = 2);

In [None]:
#save("analysis/combined/dim_0/features.jld2", "features", features_array)

In [None]:
# Seed the global RNG before the UMAP call so repeated runs start from the same state.
Random.seed!(10)
# 2-D UMAP embedding of the centered dimension-0 features, using 5 neighbors.
embedding = umap(features_centered, 2; n_neighbors = 5)
#writedlm("analysis/combined/dim_0/umap.csv", embedding, ",")


In [None]:
# load
# The embedding file is comma-delimited text without a header (see the commented-out
# `writedlm` call that produces it), so it is read back with the matching `readdlm`;
# `CSV.read` without a sink argument is rejected by current CSV.jl.
embedding = readdlm("analysis/combined/dim_0/umap.csv", ',', Float64)
gr()
n = size(embedding, 2)
# scatter the two rows of the embedding against each other, one marker per column
p = scatter(embedding[1, :], embedding[2, :];
    markercolor = "slategrey",
    markersize = 5,
    label = "",
    xticks = [],
    yticks = [],
    framestyle = :box,
    xlabel = "UMAP-1",
    ylabel = "UMAP-2",
    guidefontsize = 15,
    leftmargin = 5mm,
    size = (450, 350),
    legend = :topright)
# save this specific plot rather than whatever plot happens to be current
savefig(p, "analysis/combined/dim_0/umap.pdf")
plot(p)

### dimension 1

In [None]:
# Run `compute_PI2` on every dimension-1 diagram dictionary, in the listed order.
ECM_PI1, cancer_PI1, leukocytes_PI1, CL_PI1, ECM_C_PI1, ECM_L_PI1, all_cells_PI1 = map(
    compute_PI2,
    (ECM_PD1, cancer_PD1, leukocytes_PD1, CL_PD1, ECM_C_PD1, ECM_L_PD1, all_cells_PD1),
);

In [None]:
# select dictionaries
# Listed in the order in which their per-ROI features are concatenated below.
dicts = [
    ECM_PI1,
    cancer_PI1,
    leukocytes_PI1,
    CL_PI1,
    ECM_C_PI1,
    ECM_L_PI1,
    all_cells_PI1,
];

# get keys that are present in all dictionaries
# (iterates the keys of the first dictionary, so their order is kept)
all_keys = [k for k in keys(first(dicts)) if all(d -> haskey(d, k), dicts)]

# combine all features
# For every shared key, flatten each dictionary's entry with `vec` and stack the
# pieces into one feature vector, following the order of `dicts`.
features = Dict(f => reduce(vcat, [vec(d[f]) for d in dicts]) for f in all_keys);

In [None]:
# Stored index => ROI mapping and its inverse (ROI => index).
idx_files = load("analysis/combined/idx_ROI.jld2")["idx_ROI"];
file_idx = Dict(roi => idx for (idx, roi) in pairs(idx_files));

In [None]:
# prepare features array
# One column per entry of `idx_files`, in index order. `reduce(hcat, ...)` builds the
# matrix without splatting one argument per column into `hcat`.
n = length(idx_files)
features_array = reduce(hcat, [features[idx_files[i]] for i in 1:n])
println("features array shape: ", size(features_array))

# subtract from every row (feature) its mean over all columns
features_centered = features_array .- mean(features_array, dims = 2);

In [None]:
#save("analysis/combined/dim_1/features.jld2", "features", features_array)

In [None]:
# Seed the global RNG before the UMAP call so repeated runs start from the same state.
Random.seed!(10)
# 2-D UMAP embedding of the centered dimension-1 features, using 5 neighbors.
embedding = umap(features_centered, 2; n_neighbors = 5)
#writedlm("analysis/combined/dim_1/umap.csv", embedding, ",")


In [None]:
# load
# The embedding file is comma-delimited text without a header (see the commented-out
# `writedlm` call that produces it), so it is read back with the matching `readdlm`;
# `CSV.read` without a sink argument is rejected by current CSV.jl.
embedding = readdlm("analysis/combined/dim_1/umap.csv", ',', Float64)
gr()
n = size(embedding, 2)
# scatter the two rows of the embedding against each other, one marker per column
p = scatter(embedding[1, :], embedding[2, :];
    markercolor = "slategrey",
    markersize = 5,
    label = "",
    xticks = [],
    yticks = [],
    framestyle = :box,
    xlabel = "UMAP-1",
    ylabel = "UMAP-2",
    guidefontsize = 15,
    leftmargin = 5mm,
    size = (450, 350),
    legend = :topright)
# save this specific plot rather than whatever plot happens to be current
savefig(p, "analysis/combined/dim_1/umap.pdf")
plot(p)