# Dowker PH computation
* This notebook computes Dowker PH features.


In [None]:
include("../src/ECM_TDA.jl")
using .ECM_TDA

using Eirene
using Ripserer
using CSV
using TiffImages
using Images
using NPZ
using Plots
using PersistenceDiagrams
using Measures
using Distributions
using MultivariateStats
using LinearAlgebra
using Random
using StatsBase
using JLD2
using FileIO
using DelimitedFiles
using Distances
using UMAP


In [None]:
# assign colors to cell types
c_ECM = "#259ea1"
c_cancer = "#582793"
c_leukocytes = "#f0a53a"
# NOTE(review): neutral grey chosen as a placeholder for fibroblasts; swap in the
# project's palette color if one exists.
c_fibroblast = "#7f7f7f"

# Lookup from cell class label to plot color. Every label that the plotting loops
# index as `celltype_colors[ct]` needs an entry; without "fibroblast" that lookup
# throws a KeyError for the three-class example plots.
celltype_colors = Dict("cancer" => c_cancer,
                        "leukocytes" => c_leukocytes,
                        "fibroblast" => c_fibroblast);

# Subsample points (Example)
* Before we compute Dowker features, we need to subsample the points (due to speed issues)
* Sampled cells are saved in `data/4000x4000/subregion_cells/cells_sampled`
* If a single cell type has more points than its subsample size (1000 for cancer cells, 400 for the other cell types in the examples below), then we downsample.
* Two sampling methods are used
    (a) sample uniformly 
    (b) sample using KDE

<b> load example </b>

In [None]:
# Example region of interest, identified by (LTX, Da, idx)
LTX = "145"
Da = "318"
idx = 4

# ECM image of the subregion
image_path = "data/4000x4000/subregion_ECM/LTX$(LTX)_Da$(Da)_idx$(idx).tif"
img = Array(Images.load(image_path));

# points sampled from the ECM image
ecm_points_path = "data/4000x4000/ECM_sampled/points_CSV/LTX$(LTX)_Da$(Da)_idx$(idx).csv"
ECM_points = CSV.read(ecm_points_path)

# cell table of the same subregion
cell_path = "data/4000x4000/subregion_cells/LTX$(LTX)_Da$(Da)_idx$(idx).csv"
cells = CSV.read(cell_path);

In [None]:
gr()
# left panel: ECM image in grayscale; right panel: sampled ECM points (y axis flipped)
p1 = plot(Gray.(img);
          frame = :box, ticks = [], size = (300, 300),
          background_color = :transparent, foreground_color = :black)
p2 = scatter(ECM_points[:, 1], ECM_points[:, 2];
             c = c_ECM, label = "", markersize = 2, yflip = true,
             frame = :box, ticks = [],
             background_color = :transparent, foreground_color = :black)
plot(p1, p2, size = (600, 300))

In [None]:
cell_types = ["cancer", "leukocytes", "fibroblast"]

# one scatter panel per cell class, colored through `celltype_colors`
plot_array = Any[]
for ct in cell_types
    # rows of the cell table that belong to this class
    cell_ct = cells[cells.class .== ct, :]
    c = celltype_colors[ct]
    p_ct = scatter(cell_ct.x, cell_ct.y;
                   label = ct, c = c,
                   yflip = true, frame = :box, ticks = [],
                   markersize = 2, markerstrokewidth = 0.2)
    push!(plot_array, p_ct)
end

plot(plot_array..., layout = grid(1,3), size = (900, 300))

## (a) sample uniformly

In [None]:
plot_array = Any[]
for ct in cell_types
    # rows of the cell table that belong to this class
    cell_df = cells[cells.class .== ct, :]
    c = celltype_colors[ct]

    # cancer cells get a larger subsample than the other classes
    subsample_size = ct == "cancer" ? 1000 : 400
    sampled = sample_uniform(cell_df; subsample_size = subsample_size)

    p_ct = scatter(sampled.x, sampled.y;
                   label = ct, c = c,
                   yflip = true, frame = :box, ticks = [],
                   markersize = 2, markerstrokewidth = 0.2)
    push!(plot_array, p_ct)
end
plot(plot_array..., layout = grid(1,3), size = (900, 300))

## (b) sample using KDE

In [None]:
plot_array = Any[]
for ct in cell_types
    # rows of the cell table that belong to this class
    cell_df = cells[cells.class .== ct, :]
    c = celltype_colors[ct]

    # cancer cells get a larger subsample than the other classes
    subsample_size = ct == "cancer" ? 1000 : 400

    # only the first return value (the sampled table) is plotted here
    sampled, _ = sample_kde(cell_df; subsample_size = subsample_size)

    p_ct = scatter(sampled.x, sampled.y;
                   label = ct, c = c,
                   yflip = true, frame = :box, ticks = [],
                   markersize = 2, markerstrokewidth = 0.2)
    push!(plot_array, p_ct)
end

plot(plot_array..., layout = grid(1,3), size = (900, 300))

In [None]:
# example plot of KDE from cancer
# Select the cancer rows of the currently loaded cell table and run `sample_kde`
# on them with subsample_size = 1000. The second return value is kept in
# `kdf_cancer` so it can be plotted in the next cell.
cell_df = cells[cells.class .== "cancer", :]
sampled, kdf_cancer = sample_kde(cell_df; subsample_size = 1000);


In [None]:
# Plot the second return value of `sample_kde` for the cancer cells, with the y axis flipped.
plot(kdf_cancer, yflip = :true)

# Determine number of points to downsample
* Examine the histogram of the number of cells per file
* Determine the function from original number of cells to number of sampled points

In [None]:
# List the per-ROI cell tables: the files directly inside `dir`.
dir = "data/4000x4000_combined/subregion_cells/"
csv_files = first(walkdir(dir))[3]
cell_types = ["cancer", "leukocytes"]

# Number of cells of each type per file; entry i corresponds to csv_files[i].
# Typed vectors (instead of `[]`) keep the counts as Int for the percentile and
# histogram cells that consume them.
n_cancer = Int[]
n_leukocytes = Int[]
for file in csv_files
    cells = CSV.read(dir * file)

    for ct in cell_types
        # number of rows whose class matches this cell type
        n_cells = count(cells.class .== ct)
        if ct == "cancer"
            push!(n_cancer, n_cells)
        elseif ct == "leukocytes"
            push!(n_leukocytes, n_cells)
        end
    end
end

In [None]:
"""
    n_samples(x, low, high, max_sample)

Map an original point count `x` to the number of points to keep.

- `x <= low`: return `x` unchanged.
- `low < x <= high`: return the value at `x` of the straight line through
  `(low, low)` and `(high, max_sample)`.
- otherwise: return `max_sample`.

The result is not rounded; callers round it themselves.
"""
function n_samples(x, low, high, max_sample)
    # small inputs are kept as they are
    x <= low && return x
    # anything beyond the upper breakpoint is capped
    x <= high || return max_sample
    # in between, interpolate linearly between the two breakpoints
    return ((max_sample - low) / (high - low)) * (x - low) + low
end

## (a) ECM

In [None]:
# get number of points in ECM
dir = "data/4000x4000_combined/ECM_sampled/points_CSV/"
csv_files = first(walkdir(dir))[3]

# Number of ECM points (table rows) per file; entry i corresponds to csv_files[i].
# A typed vector replaces the untyped `[]` accumulator.
n_ECM = Int[]
for file in csv_files
    cells = CSV.read(dir * file)
    push!(n_ECM, size(cells, 1))
end

# get 1%, 50%, 99% values
ECM_low = percentile(n_ECM, 1)
ECM_med = percentile(n_ECM, 50)
ECM_high = percentile(n_ECM, 99)
println("ECM_low: ", ECM_low)
println("ECM_med: ", ECM_med)
println("ECM_high: ", ECM_high)

In [None]:
# select the GR backend of Plots for the figures that follow
gr()

In [None]:
# 1st, 50th and 99th percentiles of the per-file ECM point counts
ECM_low, ECM_med, ECM_high = (percentile(n_ECM, q) for q in (1, 50, 99))

# histogram of the counts, with the 1% and 99% values marked as vertical lines
p = histogram(n_ECM;
    label = "", color = "grey77",
    xlabel = "# ECM points", ylabel = "frequency",
    guidefontsize = 15, xtickfontsize = 12, ytickfontsize = 12, legendfontsize = 12,
    size = (500, 350))
vline!(p, fill(ECM_low, 2); label = "1%", linewidth = 5, color = "coral2")
vline!(p, fill(ECM_high, 2); label = "99%", linewidth = 5, color = "lightseagreen")
savefig("data/4000x4000_combined/Dowker/ECM_histogram.svg")
plot(p)

In [None]:
max_sample = 3000

# evaluate the downsampling rule on a grid of original counts
x = range(0, 6000, length = 10000)
y = n_samples.(x, ECM_low, ECM_high, max_sample)

# curve of the rule, with the 1% and 99% breakpoints marked
p = plot(x, y;
        label = "", color = "grey", linewidth = 5,
        xlabel = "# ECM points", ylabel = "downsampled # ECM points",
        legend = :bottomright, topmargin = 4mm,
        guidefontsize = 15, xtickfontsize = 12, ytickfontsize = 12, legendfontsize = 12,
        size = (500, 350))
vline!(p, fill(ECM_low, 2); label = "1%", linewidth = 5, color = "coral2")
vline!(p, fill(ECM_high, 2); label = "99%", linewidth = 5, color = "lightseagreen")
savefig("data/4000x4000_combined/Dowker/n_ECM_function.svg")
plot(p)

## (b) Cancer cells

In [None]:
# 1st, 50th and 99th percentiles of the per-file cancer cell counts
cancer_low, cancer_med, cancer_high = (percentile(n_cancer, q) for q in (1, 50, 99))

println("cancer_low: ", cancer_low)
println("cancer_med: ", cancer_med)
println("cancer_high: ", cancer_high)

In [None]:
# 1st, 50th and 99th percentiles of the per-file cancer cell counts
cancer_low, cancer_med, cancer_high = (percentile(n_cancer, q) for q in (1, 50, 99))

# histogram of the counts, with the 1% and 99% values marked as vertical lines
p = histogram(n_cancer;
    label = "", color = "grey77",
    xlabel = "# cancer cells", ylabel = "frequency",
    guidefontsize = 15, xtickfontsize = 12, ytickfontsize = 12, legendfontsize = 12,
    size = (500, 350))
vline!(p, fill(cancer_low, 2); label = "1%", linewidth = 5, color = "coral2")
vline!(p, fill(cancer_high, 2); label = "99%", linewidth = 5, color = "lightseagreen")
savefig("data/4000x4000_combined/Dowker/cancer_histogram.svg")
plot(p)

In [None]:
max_sample = 1000

# evaluate the downsampling rule on a grid of original counts
x = range(0, 6000, length = 10000)
y = n_samples.(x, cancer_low, cancer_high, max_sample)

# curve of the rule, with the 1% and 99% breakpoints marked
p = plot(x, y;
        label = "", color = "grey", linewidth = 5,
        xlabel = "# cancer cells", ylabel = "downsampled # cancer cells",
        legend = :bottomright, topmargin = 4mm,
        guidefontsize = 15, xtickfontsize = 12, ytickfontsize = 12, legendfontsize = 12,
        size = (500, 350))
vline!(p, fill(cancer_low, 2); label = "1%", linewidth = 5, color = "coral2")
vline!(p, fill(cancer_high, 2); label = "99%", linewidth = 5, color = "lightseagreen")
savefig("data/4000x4000_combined/Dowker/n_cancer_function.svg")
plot(p)

## (c) Leukocytes

In [None]:
# 1st, 50th and 99th percentiles of the per-file leukocyte counts
L_low, L_med, L_high = (percentile(n_leukocytes, q) for q in (1, 50, 99))

println("L_low: ", L_low)
println("L_med: ", L_med)
println("L_high: ", L_high)

In [None]:
# 1st, 50th and 99th percentiles of the per-file leukocyte counts
L_low, L_med, L_high = (percentile(n_leukocytes, q) for q in (1, 50, 99))

# histogram of the counts, with the 1% and 99% values marked as vertical lines
p = histogram(n_leukocytes;
    label = "", color = "grey77",
    xlabel = "# leukocytes", ylabel = "frequency",
    guidefontsize = 15, xtickfontsize = 12, ytickfontsize = 12, legendfontsize = 12,
    size = (500, 350))
vline!(p, fill(L_low, 2); label = "1%", linewidth = 5, color = "coral2")
vline!(p, fill(L_high, 2); label = "99%", linewidth = 5, color = "lightseagreen")
savefig("data/4000x4000_combined/Dowker/leukocytes_histogram.svg")
plot(p)

In [None]:
max_sample = 500

# evaluate the downsampling rule on a grid of original counts
x = range(0, 5500, length = 10000)
y = n_samples.(x, L_low, L_high, max_sample)

# curve of the rule, with the 1% and 99% breakpoints marked
p = plot(x, y;
        label = "", color = "grey", linewidth = 5,
        xlabel = "# leukocytes", ylabel = "downsampled # leukocytes",
        legend = :bottomright, topmargin = 4mm,
        guidefontsize = 15, xtickfontsize = 12, ytickfontsize = 12, legendfontsize = 12,
        size = (500, 350))
vline!(p, fill(L_low, 2); label = "1%", linewidth = 5, color = "coral2")
vline!(p, fill(L_high, 2); label = "99%", linewidth = 5, color = "lightseagreen")
savefig("data/4000x4000_combined/Dowker/n_leukocyte_function.svg")
plot(p)

# Subsample points from ECM, cancer cells, leukocytes

## (a) Subsample ECM points

In [None]:
# ECM point tables, one CSV per ROI
dir = "data/4000x4000_combined/ECM_sampled/points_CSV/"
csv_files = first(walkdir(dir))[3]
ECM_max_sample = 3000

# for each file: decide how many points to keep, sample them, write them out
for file in csv_files
    cells = CSV.read(dir * file)
    filename = first(split(file, "."))

    # target size from the piecewise-linear rule, rounded to an integer
    n = size(cells, 1)
    subsample_size = round(Int64, n_samples(n, ECM_low, ECM_high, ECM_max_sample))

    # only the first return value (the sampled table) is needed
    sampled, _ = sample_kde(cells; subsample_size = subsample_size)

    # store the x, y columns as a header-less comma-separated file
    writedlm("data/4000x4000_combined/Dowker/ECM/" * filename * ".csv", Array(sampled[:, [:x, :y]]), ",")
end

## (b) Subsample cancer, leukocytes

In [None]:
# cell tables, one CSV per ROI
dir = "data/4000x4000_combined/subregion_cells/"
csv_files = first(walkdir(dir))[3]
cell_types = ["cancer", "leukocytes"]
C_max_sample = 1000
L_max_sample = 500

# for each file and each cell type: sample the cells and write them out
for file in csv_files
    cells = CSV.read(dir * file)
    filename = first(split(file, "."))

    for ct in cell_types
        # rows of this cell type
        cell_df = cells[cells.class .== ct, :]
        n = size(cell_df, 1)

        # target size from the rule, using the breakpoints and cap of this cell type
        if ct == "cancer"
            subsample_size = round(Int64, n_samples(n, cancer_low, cancer_high, C_max_sample))
        elseif ct == "leukocytes"
            subsample_size = round(Int64, n_samples(n, L_low, L_high, L_max_sample))
        end

        # only the first return value (the sampled table) is needed
        sampled, _ = sample_kde(cell_df; subsample_size = subsample_size)

        # store the x, y columns as a header-less comma-separated file
        writedlm("data/4000x4000_combined/Dowker/" * ct * "/" * filename * ".csv", Array(sampled[:, [:x, :y]]), ",")
    end
end

## (c) Plot example images before and after sampling

### ECM

In [None]:
# get number of points in ECM
dir = "data/4000x4000_combined/ECM_sampled/points_CSV/"
csv_files = first(walkdir(dir))[3]

# Number of ECM points (table rows) per file; entry i corresponds to csv_files[i].
n_ECM = Int[]
for file in csv_files
    cells = CSV.read(dir * file)
    push!(n_ECM, size(cells, 1))
end

# get 1%, 50%, 99% values (the breakpoints that are passed to `n_samples`)
ECM_low = percentile(n_ECM, 1)
ECM_med = percentile(n_ECM, 50)
ECM_high = percentile(n_ECM, 99)

# The "low" example is picked at the 1.5th percentile. That target lives in its
# own variable so that `ECM_low` keeps the 1% value: overwriting it would make the
# "number of points sampled" reported for the examples disagree with the
# subsampling that was actually run with the 1% breakpoint.
ECM_low_example = percentile(n_ECM, 1.5)

# find file whose number of ECM points is closest to the low target
idx_low = index_closest_to_x(n_ECM, ECM_low_example)

# find file whose number of ECM points is closest to the 99% value
idx_high = index_closest_to_x(n_ECM, ECM_high)

# find file whose number of ECM points is closest to the median
idx_median = index_closest_to_x(n_ECM, ECM_med);

In [None]:
# file names of the three example ROIs
println("low: ", csv_files[idx_low])
println("median: ", csv_files[idx_median])
println("high: ", csv_files[idx_high])

# print info: original count and the count given by the downsampling rule
for (label, i) in (("low", idx_low), ("median", idx_median), ("high", idx_high))
    println(label)
    println("original number of points: ", n_ECM[i])
    println("number of points sampled: ", round(Int64, n_samples(n_ECM[i], ECM_low, ECM_high, ECM_max_sample)))
end

In [None]:
# For each example ROI, show the original ECM points next to the downsampled ones
gr()
examples = [("092", "523", 4), ("021", "565", 2), ("097", "113", 1)]
plot_array = []
for ex in examples
    LTX, Da, idx = ex

    # left panel: all ECM points of the ROI
    cell_path = "data/4000x4000_combined/ECM_sampled/points_CSV/LTX$(LTX)_Da$(Da)_idx$(idx).csv"
    cells = CSV.read(cell_path)
    p1 = scatter(cells.x, cells.y;
                 c = c_ECM, label = "",
                 yflip = true, frame = :box, ticks = [],
                 markersize = 2, markerstrokewidth = 0.2,
                 size = (300, 300))

    # right panel: downsampled points (file has no header row, so columns are read by position)
    cell_path = "data/4000x4000_combined/Dowker/ECM/LTX$(LTX)_Da$(Da)_idx$(idx).csv"
    cell = CSV.read(cell_path, header = false)
    p2 = scatter(cell[:, 1], cell[:, 2];
                 c = c_ECM, label = "",
                 yflip = true, frame = :box, ticks = [],
                 markersize = 2, markerstrokewidth = 0.2,
                 size = (300, 300))

    push!(plot_array, p1, p2)
end

p = plot(plot_array..., layout = grid(3,2), size = (600, 900))
savefig("data/4000x4000_combined/Dowker/downsample_ECM.svg")
plot(p)

### Cancer

In [None]:
# Pick example files by their number of cancer cells. The unused `high` and `med`
# temporaries were dropped; only the three indices are consumed afterwards.

# find file whose number of cancer cells is closest to the 1% value
idx_low = index_closest_to_x(n_cancer, cancer_low)

# find file whose number of cancer cells is closest to the 99% value
idx_high = index_closest_to_x(n_cancer, cancer_high)

# find file whose number of cancer cells is closest to the median
idx_median = index_closest_to_x(n_cancer, cancer_med);

In [None]:
# `n_cancer[i]` was counted from the i-th file of the cell-table directory, so the
# file names have to come from that same listing. The global `csv_files` is
# reassigned by other cells (e.g. to the ECM point directory) and may no longer
# line up with `n_cancer`, hence the fresh listing.
cell_files = first(walkdir("data/4000x4000_combined/subregion_cells/"))[3]

println("low: ", cell_files[idx_low])
println("median: ", cell_files[idx_median])
println("high: ", cell_files[idx_high])

# print info: original count and the count given by the downsampling rule
for (label, i) in (("low", idx_low), ("median", idx_median), ("high", idx_high))
    println(label)
    println("original number of points: ", n_cancer[i])
    println("number of points sampled: ", round(Int64, n_samples(n_cancer[i], cancer_low, cancer_high, C_max_sample)))
end

In [None]:
# For each example ROI, show the original cancer cells next to the downsampled ones
gr()
examples = [("079", "108", 2), ("082", "161", 3), ("210", "691", 3)]
plot_array = []
for ex in examples
    LTX, Da, idx = ex

    # left panel: all cancer cells of the ROI
    cell_path = "data/4000x4000_combined/subregion_cells/LTX$(LTX)_Da$(Da)_idx$(idx).csv"
    cells = CSV.read(cell_path);
    cell_ct = cells[cells.class .== "cancer", :]
    c = celltype_colors["cancer"]
    p1 = scatter(cell_ct.x, cell_ct.y;
                 c = c, label = "",
                 yflip = true, frame = :box, ticks = [],
                 markersize = 2, markerstrokewidth = 0.2,
                 size = (300, 300))

    # right panel: downsampled cells (file has no header row, so columns are read by position)
    cell_path = "data/4000x4000_combined/Dowker/cancer/LTX$(LTX)_Da$(Da)_idx$(idx).csv"
    cell_ct = CSV.read(cell_path, header = false)
    c = celltype_colors["cancer"]
    p2 = scatter(cell_ct[:, 1], cell_ct[:, 2];
                 c = c, label = "",
                 yflip = true, frame = :box, ticks = [],
                 markersize = 2, markerstrokewidth = 0.2,
                 size = (300, 300))

    push!(plot_array, p1, p2)
end

p = plot(plot_array..., layout = grid(3,2), size = (600, 900))
savefig("data/4000x4000_combined/Dowker/downsample_cancer.svg")
plot(p)

### Leukocytes

In [None]:
# Pick example files by their number of leukocytes. The unused `high` and `med`
# temporaries were dropped; only the three indices are consumed afterwards.

# find file whose number of leukocytes is closest to the 1% value
idx_low = index_closest_to_x(n_leukocytes, L_low)

# find file whose number of leukocytes is closest to the 99% value
idx_high = index_closest_to_x(n_leukocytes, L_high)

# find file whose number of leukocytes is closest to the median
idx_median = index_closest_to_x(n_leukocytes, L_med);

# `n_leukocytes[i]` was counted from the i-th file of the cell-table directory, so
# the file names have to come from that same listing. The global `csv_files` is
# reassigned by other cells and may no longer line up with `n_leukocytes`.
cell_files = first(walkdir("data/4000x4000_combined/subregion_cells/"))[3]

println("low: ", cell_files[idx_low])
println("median: ", cell_files[idx_median])
println("high: ", cell_files[idx_high])

# print info: original count and the count given by the downsampling rule
for (label, i) in (("low", idx_low), ("median", idx_median), ("high", idx_high))
    println(label)
    println("original number of points: ", n_leukocytes[i])
    println("number of points sampled: ", round(Int64, n_samples(n_leukocytes[i], L_low, L_high, L_max_sample)))
end

In [None]:
# For each example ROI, show the original leukocytes next to the downsampled ones.
# Both inputs are read from the `4000x4000_combined` tree: the example ROIs were
# picked from counts of `4000x4000_combined/subregion_cells`, and the subsampling
# cell writes its leukocyte samples to `4000x4000_combined/Dowker/leukocytes`.
# NOTE(review): the previous paths pointed at `data/4000x4000/...` and
# `cells_sampled/leukocytes`; confirm no figure depends on those older files.
examples = [("156", "195", 2), ("115", "164", 2), ("092", "378", 1)]

gr()
plot_array = []
for ex in examples
    LTX, Da, idx = ex

    # left panel: all leukocytes of the ROI
    cell_path = "data/4000x4000_combined/subregion_cells/LTX$(LTX)_Da$(Da)_idx$(idx).csv"
    cells = CSV.read(cell_path);
    cell_ct = cells[cells.class .== "leukocytes", :]
    c = celltype_colors["leukocytes"]
    p1 = scatter(cell_ct.x, cell_ct.y;
                 c = c, label = "",
                 yflip = true, frame = :box, ticks = [],
                 markersize = 2, markerstrokewidth = 0.2,
                 size = (300, 300))

    # right panel: downsampled cells (file has no header row, so columns are read by position)
    cell_path = "data/4000x4000_combined/Dowker/leukocytes/LTX$(LTX)_Da$(Da)_idx$(idx).csv"
    cell_ct = CSV.read(cell_path, header = false)
    p2 = scatter(cell_ct[:, 1], cell_ct[:, 2];
                 c = c, label = "",
                 yflip = true, frame = :box, ticks = [],
                 markersize = 2, markerstrokewidth = 0.2,
                 size = (300, 300))

    push!(plot_array, p1, p2)
end
p = plot(plot_array..., layout = grid(3,2), size = (600, 900))
savefig("data/4000x4000_combined/Dowker/downsample_leukocytes.svg")
plot(p)


# Example Dowker persistence diagram computation

## (a) among cells

In [None]:
# example ROI
LTX = "145"
Da = "318"
idx = 4

# read the subsampled cancer and leukocyte coordinates of that ROI
dir = "data/4000x4000_combined/Dowker"
filename = "LTX$(LTX)_Da$(Da)_idx$(idx).csv"
cells_cancer = readdlm("$(dir)/cancer/$(filename)", ',')
cells_leukocytes = readdlm("$(dir)/leukocytes/$(filename)", ',');

In [None]:
# three panels: cancer only, leukocytes only, both overlaid
p1 = scatter(cells_cancer[:, 1], cells_cancer[:, 2];
             c = c_cancer, label = "",
             yflip = true, frame = :box, ticks = [],
             markersize = 2, markerstrokewidth = 0.2)
p2 = scatter(cells_leukocytes[:, 1], cells_leukocytes[:, 2];
             c = c_leukocytes, label = "",
             yflip = true, frame = :box, ticks = [],
             markersize = 2, markerstrokewidth = 0.2)
p3 = scatter(cells_cancer[:, 1], cells_cancer[:, 2];
             c = c_cancer, label = "",
             yflip = true, frame = :box, ticks = [],
             markersize = 2, markerstrokewidth = 0.2)
# add the leukocytes on top of the cancer cells in the third panel
scatter!(p3, cells_leukocytes[:, 1], cells_leukocytes[:, 2];
         c = c_leukocytes, label = "",
         yflip = true, frame = :box, ticks = [],
         markersize = 2, markerstrokewidth = 0.2)

plot(p1, p2,p3,  layout = grid(1,3), size = (900, 300))

In [None]:
# Run `compute_Dowker` on the example leukocyte and cancer coordinates. The first two
# return values are kept as `W_barcode0` and `W_barcode1`; the third is bound to `d`
# (its meaning is not visible in this notebook).
W_barcode0, W_barcode1, d = compute_Dowker(cells_leukocytes, cells_cancer)

## (b) Between ECM and cell

In [None]:
# example ROI
LTX = "160"
Da = "328"
idx = 2

# read the subsampled cancer, leukocyte and ECM coordinates of that ROI
dir = "data/4000x4000_combined/Dowker/"
filename = "LTX$(LTX)_Da$(Da)_idx$(idx).csv"
cells_cancer = readdlm("$(dir)/cancer/$(filename)", ',')
cells_leukocytes = readdlm("$(dir)/leukocytes/$(filename)", ',')
cells_ECM = readdlm("data/4000x4000_combined/Dowker/ECM/$(filename)", ',');

# load the three subregion-center dictionaries and merge them into one lookup
subregion_centers = load("data/4000x4000/subregion_centers.jld2")["subregion_centers"];
subregion_centers_green = load("data/4000x4000_201222/subregion_centers_green.jld2")["subregion_centers_green"];
subregion_centers_purple = load("data/4000x4000_201222/subregion_centers_purple.jld2")["subregion_centers_purple"];
subregion_all = merge(subregion_centers, subregion_centers_green, subregion_centers_purple);

In [None]:
# boundaries of the subregion around the stored center of this ROI
center_x, center_y = subregion_all[(LTX,Da)][idx]
subregion_size = 4000
xmin, xmax, ymin, ymax = get_subregion_boundaries(center_x, center_y, subregion_size)

# shift the cells: `ymin` is subtracted from column 1 and `xmin` from column 2
# NOTE(review): presumably column 1 holds the coordinate that pairs with `ymin` - confirm
n = size(cells_cancer, 1)
cells_cancer_new = cells_cancer .- repeat([ymin xmin], n);

In [None]:
gr()
# three panels: shifted cancer cells, ECM points, both overlaid
p1 = scatter(cells_cancer_new[:, 1], cells_cancer_new[:, 2];
             c = c_cancer, label = "",
             yflip = true, frame = :box, ticks = [],
             markersize = 2, markerstrokewidth = 0.2)

p2 = scatter(cells_ECM[:, 1], cells_ECM[:, 2];
             c = c_ECM, label = "", markersize = 2, yflip = true,
             frame = :box, ticks = [],
             background_color = :transparent, foreground_color = :black)
p3 = scatter(cells_cancer_new[:, 1], cells_cancer_new[:, 2];
             c = c_cancer, label = "",
             yflip = true, frame = :box, ticks = [],
             markersize = 2, markerstrokewidth = 0.2)
# add the ECM points on top of the cancer cells in the third panel
scatter!(p3, cells_ECM[:, 1], cells_ECM[:, 2];
         c = c_ECM, label = "", markersize = 2, yflip = true,
         frame = :box, ticks = [],
         background_color = :transparent, foreground_color = :black)
plot(p1, p2, p3, layout = grid(1,3), size = (900, 300))

## (c) Between ECM and leukocytes

In [None]:
# example ROI
LTX = "145"
Da = "2441"
idx = 2

# read the subsampled cancer, leukocyte and ECM coordinates of that ROI
dir = "data/4000x4000_combined/Dowker/"
filename = "LTX$(LTX)_Da$(Da)_idx$(idx).csv"
cells_cancer = readdlm("$(dir)/cancer/$(filename)", ',')
cells_leukocytes = readdlm("$(dir)/leukocytes/$(filename)", ',')
cells_ECM = readdlm("data/4000x4000_combined/Dowker/ECM/$(filename)", ',');

# load the three subregion-center dictionaries and merge them into one lookup
subregion_centers = load("data/4000x4000/subregion_centers.jld2")["subregion_centers"];
subregion_centers_green = load("data/4000x4000_201222/subregion_centers_green.jld2")["subregion_centers_green"];
subregion_centers_purple = load("data/4000x4000_201222/subregion_centers_purple.jld2")["subregion_centers_purple"];
subregion_all = merge(subregion_centers, subregion_centers_green, subregion_centers_purple);

In [None]:
# boundaries of the subregion around the stored center of this ROI
center_x, center_y = subregion_all[(LTX,Da)][idx]
subregion_size = 4000
xmin, xmax, ymin, ymax = get_subregion_boundaries(center_x, center_y, subregion_size)

# shift the cells: `ymin` is subtracted from column 1 and `xmin` from column 2
# NOTE(review): presumably column 1 holds the coordinate that pairs with `ymin` - confirm
n = size(cells_leukocytes, 1)
cells_leukocytes_new = cells_leukocytes .- repeat([ymin xmin], n);

In [None]:
gr()
# three panels: shifted leukocytes, ECM points, both overlaid
p1 = scatter(cells_leukocytes_new[:, 1], cells_leukocytes_new[:, 2];
             c = c_leukocytes, label = "",
             yflip = true, frame = :box, ticks = [],
             markersize = 2, markerstrokewidth = 0.2)

p2 = scatter(cells_ECM[:, 1], cells_ECM[:, 2];
             c = c_ECM, label = "", markersize = 2, yflip = true,
             frame = :box, ticks = [],
             background_color = :transparent, foreground_color = :black)
p3 = scatter(cells_leukocytes_new[:, 1], cells_leukocytes_new[:, 2];
             c = c_leukocytes, label = "",
             yflip = true, frame = :box, ticks = [],
             markersize = 2, markerstrokewidth = 0.2)
# add the ECM points on top of the leukocytes in the third panel
scatter!(p3, cells_ECM[:, 1], cells_ECM[:, 2];
         c = c_ECM, label = "", markersize = 2, yflip = true,
         frame = :box, ticks = [],
         background_color = :transparent, foreground_color = :black)
plot(p1, p2, p3, layout = grid(1,3), size = (900, 300))

# Compute Dowker persistence diagrams for all pairs among: ECM, cancer cells, leukocytes

## Between cancer and leukocytes

In [None]:
# Compute the cancer/leukocytes Dowker barcodes for every subsampled ROI and write
# them to cancer_leukocytes/PD0 and cancer_leukocytes/PD1 (one CSV per ROI).
cell_dir = "data/4000x4000_combined/Dowker/"
csv_files = first(walkdir(cell_dir * "cancer/"))[3]
for file in csv_files

    # load sampled cells
    cells_leukocytes = readdlm(cell_dir * "leukocytes/" * file, ',')
    cells_cancer = readdlm(cell_dir * "cancer/" * file, ',')

    W_barcode0, W_barcode1, _ = compute_Dowker(cells_cancer, cells_leukocytes)

    # save; when a barcode is `nothing`, `zeros()` (a single 0.0) is written instead
    # so that the file still exists. `=== nothing` is the identity test for `nothing`.
    writedlm("data/4000x4000_combined/Dowker/cancer_leukocytes/PD0/" * file,
             W_barcode0 === nothing ? zeros() : W_barcode0, ",")
    writedlm("data/4000x4000_combined/Dowker/cancer_leukocytes/PD1/" * file,
             W_barcode1 === nothing ? zeros() : W_barcode1, ",")
end

## Between ECM and leukocytes

In [None]:
# Scratch cell: repeats the per-file steps of the ECM/leukocytes loop for the single
# file at index 255 of `csv_files`.
# NOTE(review): `ECM_dir` is first assigned in the following cell, and `cell_dir`
# still holds the value from the cancer/leukocytes cell above when the notebook is
# run top to bottom - confirm the intended execution order.
idx = 255
file = csv_files[idx]
# parse LTX, Da and the ROI index out of the file name; `idx` is overwritten here
LTX = split(file, "_")[1][4:end]
Da = split(file, "_")[2][3:end]
idx = parse(Int64, split(split(file,"_")[3], ".")[1][4:end])

# load sampled cells
cells_leukocytes = readdlm(cell_dir  * file, ',')
# NOTE(review): any error raised while reading the ECM file is discarded silently
try
    cells_ECM = readdlm(ECM_dir * file, ',')
#cells_ECM = Float64.(cells_ECM[2:end,:])
catch e
end

# adjust the coordinates of cells to align with ECM
# The block below is a string literal, not executed code: it holds the remaining
# steps (coordinate shift and `compute_Dowker` call) in disabled form.
"""
center_x, center_y = subregion_all[(LTX,Da)][idx]
xmin, xmax, ymin, ymax = get_subregion_boundaries(center_x, center_y, subregion_size)
n = size(cells_leukocytes, 1)
cells_leukocytes_new = cells_leukocytes .- hcat(ones(n) * ymin, ones(n) * xmin);
W_barcode0, W_barcode1, _ = compute_Dowker(cells_ECM, cells_leukocytes_new)
"""

In [None]:
# Compute the ECM/leukocytes Dowker barcodes and write them to ECM_leukocytes/PD0
# and ECM_leukocytes/PD1 (one CSV per ROI).
cell_dir = "data/4000x4000_combined/Dowker/leukocytes/"
ECM_dir = "data/4000x4000_combined/Dowker/ECM/"
csv_files = first(walkdir(ECM_dir))[3]

# load subregions
subregion_centers = load("data/4000x4000/subregion_centers.jld2")["subregion_centers"];
subregion_centers_green = load("data/4000x4000_201222/subregion_centers_green.jld2")["subregion_centers_green"];
subregion_centers_purple = load("data/4000x4000_201222/subregion_centers_purple.jld2")["subregion_centers_purple"];
subregion_all = merge(subregion_centers, subregion_centers_green, subregion_centers_purple);
subregion_size = 4000

# NOTE(review): processing starts at file 255, so the first 254 files are skipped.
# This looks like a resume point from an interrupted run; set it to 1 to process
# every file.
first_file_index = 255

for file_index in first_file_index:length(csv_files)
    file = csv_files[file_index]
    # get LTX, Da and the ROI index from the file name (kept separate from the
    # loop counter instead of overwriting it)
    LTX = split(file, "_")[1][4:end]
    Da = split(file, "_")[2][3:end]
    roi_idx = parse(Int64, split(split(file,"_")[3], ".")[1][4:end])

    # load sampled cells
    cells_leukocytes = readdlm(cell_dir  * file, ',')
    try
        cells_ECM = readdlm(ECM_dir * file, ',')

        # adjust the coordinates of cells to align with ECM:
        # `ymin` is subtracted from column 1 and `xmin` from column 2
        center_x, center_y = subregion_all[(LTX,Da)][roi_idx]
        xmin, xmax, ymin, ymax = get_subregion_boundaries(center_x, center_y, subregion_size)
        n = size(cells_leukocytes, 1)
        cells_leukocytes_new = cells_leukocytes .- hcat(ones(n) * ymin, ones(n) * xmin);
        W_barcode0, W_barcode1, _ = compute_Dowker(cells_ECM, cells_leukocytes_new)

        # save; `zeros()` (a single 0.0) stands in for a barcode that is `nothing`
        writedlm("data/4000x4000_combined/Dowker/ECM_leukocytes/PD0/" * file,
                 W_barcode0 === nothing ? zeros() : W_barcode0, ",")
        writedlm("data/4000x4000_combined/Dowker/ECM_leukocytes/PD1/" * file,
                 W_barcode1 === nothing ? zeros() : W_barcode1, ",")
    catch err
        # Still best-effort (the loop moves on to the next file), but the failure
        # is reported instead of being discarded.
        @warn "Skipping ECM/leukocytes Dowker computation" file exception = (err, catch_backtrace())
    end
end

## Between ECM and cancer 
Be careful to shift the cell coordinates by the subregion offsets (`ymin`, `xmin`) so that they align with the ECM coordinates.

In [None]:
# Compute the ECM/cancer Dowker barcodes for every ROI and write them to
# ECM_cancer/PD0 and ECM_cancer/PD1 (one CSV per ROI).
cell_dir = "data/4000x4000_combined/Dowker/cancer/"
ECM_dir = "data/4000x4000_combined/Dowker/ECM/"
csv_files = first(walkdir(ECM_dir))[3]

# load subregions
subregion_centers = load("data/4000x4000/subregion_centers.jld2")["subregion_centers"];
subregion_centers_green = load("data/4000x4000_201222/subregion_centers_green.jld2")["subregion_centers_green"];
subregion_centers_purple = load("data/4000x4000_201222/subregion_centers_purple.jld2")["subregion_centers_purple"];
subregion_all = merge(subregion_centers, subregion_centers_green, subregion_centers_purple);

subregion_size = 4000

for file in csv_files
    # get LTX, Da and the ROI index from the file name
    LTX = split(file, "_")[1][4:end]
    Da = split(file, "_")[2][3:end]
    roi_idx = parse(Int64, split(split(file,"_")[3], ".")[1][4:end])

    # load sampled cells
    cells_cancer = readdlm(cell_dir  * file, ',')
    try
        cells_ECM = readdlm(ECM_dir * file, ',')

        # adjust the coordinates of cancer cells to align with ECM:
        # `ymin` is subtracted from column 1 and `xmin` from column 2
        center_x, center_y = subregion_all[(LTX,Da)][roi_idx]
        xmin, xmax, ymin, ymax = get_subregion_boundaries(center_x, center_y, subregion_size)
        n = size(cells_cancer, 1)
        cells_cancer_new = cells_cancer .- hcat(ones(n) * ymin, ones(n) * xmin);

        W_barcode0, W_barcode1, _ = compute_Dowker(cells_ECM, cells_cancer_new)

        # save; each barcode is checked on its own, as in the other two pairings.
        # Previously a `nothing` W_barcode1 next to a valid W_barcode0 was passed
        # straight to `writedlm`, and the resulting error was swallowed.
        writedlm("data/4000x4000_combined/Dowker/ECM_cancer/PD0/" * file,
                 W_barcode0 === nothing ? zeros() : W_barcode0, ",")
        writedlm("data/4000x4000_combined/Dowker/ECM_cancer/PD1/" * file,
                 W_barcode1 === nothing ? zeros() : W_barcode1, ",")
    catch err
        # Still best-effort (the loop moves on to the next file), but the failure
        # is reported instead of being discarded.
        @warn "Skipping ECM/cancer Dowker computation" file exception = (err, catch_backtrace())
    end
end

In [None]:
# number of files in the most recent `csv_files` listing
length(csv_files)

# Compute persistence image features from Dowker persistence diagrams

In [None]:
"""
    get_PD0_max2(PD_dict)

Return the largest, over all diagrams in `PD_dict`, of each diagram's
second-largest entry.

`PD_dict` maps a key to a persistence diagram stored as a numeric array. Diagrams
equal to the 1x1 placeholder `[0.0]` are skipped.

The previous one-liner built a 1xN row with `hcat(v...)` and sorted it along
`dims = 1`, which leaves a row unchanged, so `[end-1]` picked an entry by storage
order rather than the second-largest value. Each diagram is now flattened with
`vec` and fully sorted.

Throws if no diagram remains after skipping placeholders, as `maximum` of an empty
collection does.
"""
function get_PD0_max2(PD_dict)
    placeholder = reshape([0.0], 1, 1)
    return maximum(sort(vec(pd))[end - 1] for pd in values(PD_dict) if pd != placeholder)
end

In [None]:
s = "ECM_cancer" # choose one of the following: "cancer_leukocytes", "ECM_cancer", "ECM_leukocytes"
dir = "data/4000x4000_combined/Dowker/" * s * "/PD0/"
csv_files = first(walkdir(dir))[3]

# read the dimension-0 and dimension-1 diagrams of every ROI, keyed by file name without ".csv"
PD0 = Dict()
PD1 = Dict()
for file in csv_files
    ROI = file[1:end-4]
    PD0[ROI] = readdlm("data/4000x4000_combined/Dowker/" * s * "/PD0/" * file, ',')

    # an empty PD1 file is replaced by the 1x1 placeholder [0.0]
    pd1_path = "data/4000x4000_combined/Dowker/" * s * "/PD1/" * file
    PD1[ROI] = filesize(pd1_path) == 0 ? reshape(Array([0.0]), 1, 1) : readdlm(pd1_path, ',')
end

# convert to Ripser PD, leaving out the placeholder diagrams
PH0 = Dict(k => ECM_TDA.array_to_ripsererPD(v)
           for (k, v) in PD0 if v != reshape(Array([0.0]), 1, 1))
PH1 = Dict(k => ECM_TDA.array_to_ripsererPD(v)
           for (k, v) in PD1 if v != reshape(Array([0.0]), 1, 1))

# compute PI
PI0 = ECM_TDA.compute_PI(PH0)
PI1 = ECM_TDA.compute_PI(PH1);

# compute maximum PD values (for plotting)
# `get_PD0_max2` is used on purpose for dimension 0: Dowker PD0 doesn't just end with one connected component
PD0_max = get_PD0_max2(PD0)
PD1_max = get_PD1_max(PD1);


# save("data/4000x4000_combined/Dowker/" * s * "/PD.jld2", 
#     "PD0", PD0, 
#     "PD1", PD1, 
#     "PI0", PI0, 
#     "PI1", PI1,
#     "PD0_max", PD0_max,
#     "PD1_max", PD1_max)