# Assign closest cluster
* For each tile of a wholeslide image, assign the closest cluster.
* The clusters are computed from the 400 ROIs.

In [1]:
include("../src/ECM_TDA.jl")
using .ECM_TDA
include("wholeslide_helper.jl")
using .wholeslide_helper
using Eirene
using Ripserer
using CSV
using TiffImages
using Images
using NPZ
using Plots
using PersistenceDiagrams
using Measures
using DataFrames
using Distances
using Distributions
using MultivariateStats
using LinearAlgebra
using Random
using StatsBase
using JLD2
using FileIO
using PersistenceDiagrams
using DelimitedFiles
using UMAP

│   exception = (LoadError("/Users/irisyoon/.julia/packages/Plots/yJrrq/src/backends/hdf5.jl", 36, UndefVarError(:Group)), Union{Ptr{Nothing}, Base.InterpreterIP}[Ptr{Nothing} @0x00000001012872cf, Ptr{Nothing} @0x000000010131c206, Ptr{Nothing} @0x000000010131d23b, Ptr{Nothing} @0x000000010131bb94, Ptr{Nothing} @0x000000010131bf0c, Base.InterpreterIP in top-level CodeInfo for Plots._hdf5_implementation at statement 4, Ptr{Nothing} @0x0000000101336844, Ptr{Nothing} @0x0000000101335f65, Ptr{Nothing} @0x0000000101336604, Ptr{Nothing} @0x0000000101336604, Ptr{Nothing} @0x0000000101337134, Ptr{Nothing} @0x0000000114d104e7, Ptr{Nothing} @0x0000000101305fbf, Ptr{Nothing} @0x0000000114d61312, Ptr{Nothing} @0x0000000114997a65, Ptr{Nothing} @0x0000000101305fbf, Ptr{Nothing} @0x0000000101312839, Ptr{Nothing} @0x000000010685aa9e, Ptr{Nothing} @0x0000000101305fbf, Ptr{Nothing} @0x000000010131d25f, Ptr{Nothing} @0x000000010131bb94, Ptr{Nothing} @0x000000010131bf0c, Base.InterpreterIP in top-level Cod

In [2]:
# Select the GR backend for Plots.
gr()

Plots.GRBackend()

# Color patches of wholeslide images according to their closest cluster (multiple LTX)

In [55]:
# Identifiers of the LTX wholeslide images processed in this cell.
LTXs = ["001","013", "021", "034", "041", "050", "051", "073", "079",
        "092", "100", "108", "115", "142", "143", "145", "185", "206", "210", "221"]

gr()


# load ECM features from original ROIs (dataset 1)
idx_ROI_original = load("../data/4000x4000_combined/ECM_PI01_idx_files.jld2")["idx_files"]
features_original = load("../analysis/ECM/combined/features.jld2")["features"]

# load the cluster index (new clusters)
# The file has no header row, so CSV names its single column `Column1`;
# row i holds the cluster label of original ROI i.
# `CSV.File` is materialised into a DataFrame explicitly because `CSV.read`
# without a sink argument is rejected by current CSV.jl releases.
df = DataFrame(CSV.File("cluster_labels_python.csv"; header = false))

# cluster label => indices of the original ROIs carrying that label.
# Labels -1..7 are pre-seeded so they are always present as keys (even when
# empty); any other label found in the file gets its own entry instead of
# raising a KeyError.
clusters_original = Dict{Int,Vector{Int}}(label => Int[] for label in -1:7)
for (i, label) in enumerate(df.Column1)
    push!(get!(() -> Int[], clusters_original, Int(label)), i)
end

# inverse lookup: original ROI index => cluster label
idx_clusters_original = Dict(j => k for (k,v) in clusters_original for j in v);


# load the PI used in the 400 ROIs (need to use the same limits when computing PI)
PI_ranges = load("../data/4000x4000_combined/ECM_PD/PI_ranges.jld2")

# dimension-0 persistence image limits
PI0_xmin = PI_ranges["PI0_xmin"]
PI0_xmax = PI_ranges["PI0_xmax"]
PI0_ymin = PI_ranges["PI0_ymin"]
PI0_ymax = PI_ranges["PI0_ymax"]
# dimension-1 persistence image limits
PI1_xmin = PI_ranges["PI1_xmin"]
PI1_xmax = PI_ranges["PI1_xmax"]
PI1_ymin = PI_ranges["PI1_ymin"]
PI1_ymax = PI_ranges["PI1_ymax"];

# For every LTX wholeslide image: turn the stored persistence diagrams into
# persistence images, assign each tile to its closest cluster, save the
# assignment as CSV and plot the sampled points colored by cluster.

# The persistence-image transforms depend only on the limits loaded from
# PI_ranges, so they are built once instead of once per slide.
PI0 = PersistenceImage((PI0_ymin, PI0_ymax), (PI0_xmin, PI0_xmax), sigma = 50, size = (20, 1))
PI1 = PersistenceImage((PI1_ymin, PI1_ymax), (PI1_xmin, PI1_xmax), sigma = 50, size = (20, 20))

# compute average features of each cluster (independent of the slide, so hoisted)
# NOTE(review): a cluster with no members yields a NaN average here — confirm
# that find_closest_cluster_for_wholeslide_tiles copes with that.
avg_features_by_clusters = Dict(
    label => mean(features_original[:, members], dims = 2)
    for (label, members) in clusters_original
)

for LTX in LTXs
    println(LTX)

    # create the output directory; mkpath also creates a missing parent
    # directory and is a no-op when the directory already exists
    out_dir = "analysis_TDA/LTX" * string(LTX)
    mkpath(out_dir)

    # load PD
    PD = load("data_TDA/LTX" * LTX * "/PD.jld2")

    # convert to Ripserer diagrams, skipping entries stored as `nothing`
    PH0 = Dict(k => ECM_TDA.array_to_ripsererPD(v) for (k, v) in PD["PD0"] if v !== nothing)
    PH1 = Dict(k => ECM_TDA.array_to_ripsererPD(v) for (k, v) in PD["PD1"] if v !== nothing)

    # compute PI (containers kept as Dict{Any,Any}, the type the helper received before)
    ECM_PI0 = Dict{Any,Any}(k => PI0(ph) for (k, ph) in PH0)
    ECM_PI1 = Dict{Any,Any}(k => PI1(ph) for (k, ph) in PH1)

    # combine PI (dim 0, dim 1) features
    features_dict = combine_dim01_PIs(ECM_PI0, ECM_PI1)

    # fix an ordering of the tiles: column i of `features` belongs to ROIs[i]
    ROIs = collect(keys(features_dict))
    ROI_idx = Dict(roi => i for (i, roi) in enumerate(ROIs))

    # get features array (one column per tile); reduce(hcat, ...) avoids
    # splatting a collection with one entry per tile
    features = reduce(hcat, [features_dict[roi] for roi in ROIs])

    # for each tile (of wholeslide image), find closest cluster (according to averaged PI)
    assigned_clusters = find_closest_cluster_for_wholeslide_tiles(features, avg_features_by_clusters)

    # save the assigned clusters, one row per tile in the order of ROIs.
    # Named `assignments` rather than `df` so the cluster-label table loaded
    # before this loop is not overwritten.
    # The last 4 characters of each tile name are dropped — presumably a file
    # extension; confirm against the keys stored in PD.jld2.
    assignments = DataFrame(
        coordinates = [roi[1:end-4] for roi in ROIs],
        cluster = [assigned_clusters[i] for i in eachindex(ROIs)],
    )
    CSV.write(out_dir * "/assigned_clusters.csv", assignments)

    # plot points sampled from wholeslide, colored by closest cluster
    CSV_directory = "/Volumes/My Passport/wholeslide_sampled_points/" * string(LTX) * "/points_CSV/"
    filename = out_dir * "/LTX" * string(LTX) * "_colored_by_clusters.png"
    plot_wholeslide_points_colored_by_clusters2(assigned_clusters, ROI_idx; CSV_directory = CSV_directory, filename = filename, markersize = 3, size = (7000, 7000))
end

001
013
021
034
041
050
051
073
079
092
100
108
115
142
143
145
185
206
210
221
