In [1]:
using Pkg
using CSV 
using DataFrames
using Dates

In [2]:
include("embeddings.jl")

train_plot (generic function with 2 methods)

## Loading Leucegene dataset
### Loading the complete transcriptomic profile of the prognostic subset

In [3]:
# Read the transcriptome-wide TPM table of the prognostic subset and time the load.
filename = "/u/sauves/leucegene-shared/Data/lgn_pronostic_GE_TRSC_TPM.csv"
GE_TRSC_TPM = @time CSV.read(filename, DataFrame)
nothing  # end the cell on `nothing` so the notebook does not render the table

 15.907817 seconds (39.19 M allocations: 2.384 GiB, 4.67% gc time)


### loading LSC17 expressions only

In [4]:
# Read the table restricted to the LSC17 signature genes.
# NOTE: this rebinds the global `filename` used by the previous cell.
filename = "/u/sauves/leucegene-shared/Data/SIGNATURES/LSC17_lgn_pronostic_expressions.csv"
LSC17_TPM = CSV.read(filename, DataFrame)
nothing  # end the cell on `nothing` so the notebook does not render the table

In [5]:
"""
    Data(name, data, d1_index, d2_index, d3_index)

Mutable container bundling a named dataset with string labels for its axes.

# Fields
- `name::String`: label of the dataset.
- `data`: the payload; left untyped, so any value is accepted.
- `d1_index::Vector{String}`: labels along the first dimension.
- `d2_index::Vector{String}`: labels along the second dimension.
- `d3_index::Vector{String}`: labels along the third dimension.
"""
mutable struct Data
    name::String
    data
    d1_index::Vector{String}
    d2_index::Vector{String}
    d3_index::Vector{String}
end

In [6]:
# Wrap the LSC17 table together with its axis labels.
data_matrix = let df = LSC17_TPM
    Data(
        "LSC17",
        df,
        df[:, 1],          # first column: row identifiers
        names(df)[2:end],  # names of every column after the identifier column
        df[:, 1],          # third index reuses the row identifiers
    )
end

Data("LSC17", [1m300×18 DataFrame[0m
[1m Row [0m│[1m Column1 [0m[1m ENSG00000174059 [0m[1m ENSG00000104341 [0m[1m ENSG00000128805 [0m[1m ENSG0000009[0m ⋯
[1m     [0m│[90m String7 [0m[90m Float64         [0m[90m Float64         [0m[90m Float64         [0m[90m Float64    [0m ⋯
─────┼──────────────────────────────────────────────────────────────────────────
   1 │ 01H001          1.56679          0.740309         2.04255           0.4 ⋯
   2 │ 02H003          4.86434          2.08587          1.41262           4.9
   3 │ 02H009          1.81276          5.46577          3.21354           4.6
   4 │ 02H017          0.966059         4.17188          2.5548            4.7
   5 │ 02H026          0.251955         3.00906          0.545545          2.5 ⋯
   6 │ 02H033          0.507741         2.62084          1.12384           2.1
   7 │ 02H053          0.050405         2.54472          2.79131           5.1
   8 │ 02H066          1.4689           5.57215          3.6

In [7]:
"""
    prep_data(DF::DataFrame; device = gpu)

Flatten a table into parallel index vectors and a row of values.

The first column of `DF` is treated as an identifier column and dropped. Every
remaining entry is transformed with `log10(x + 1)` and stored as `Float32`.

# Keywords
- `device`: function applied to each returned array. The default `gpu` is not
  defined in this cell (presumably provided by Flux via `embeddings.jl` — confirm).

# Returns
`((d1_index, d2_index, d3_index), values)` where, with `n` rows and `m` value
columns:
- `d1_index::Vector{Int32}`: row number `i` of each entry.
- `d2_index::Vector{Int32}`: column number `j` of each entry (counted after the
  identifier column is dropped).
- `d3_index::Vector{Int32}`: placeholder, currently a copy of `d1_index`.
- `values::Matrix{Float32}` of size `1 × n*m`; entry `(i, j)` is stored at
  position `(i - 1) * m + j`.

The size of `values` is printed as a side effect.
"""
function prep_data(DF::DataFrame; device = gpu)
    # Drop the identifier column and log-transform the remaining entries.
    logged = log10.(Matrix(DF[:, 2:end]) .+ 1)
    n, m = size(logged)

    # Transposing first makes the column-major flattening enumerate entries row by
    # row, i.e. entry (i, j) ends up at linear position (i - 1) * m + j.
    values = reshape(Float32.(permutedims(logged)), 1, n * m)
    print(size(values))

    d1_index = repeat(Int32(1):Int32(n); inner = m)  # 1,1,…,1,2,2,…
    d2_index = repeat(Int32(1):Int32(m); outer = n)  # 1,2,…,m,1,2,…
    d3_index = copy(d1_index)                        # placeholder for a third factor

    return (device(d1_index), device(d2_index), device(d3_index)), device(values)
end

prep_data (generic function with 2 methods)

## Training Factorized Embedding models on Leucegene
### Experiment 1: training on Leucegene (300 samples) with LSC17 gene expressions.

In [8]:
# X is the tuple of three index vectors and Y the 1 × (n*m) row of values
# returned by `prep_data` for the LSC17 table.
X, Y = prep_data(LSC17_TPM)


(1, 5100)

((Int32[1, 1, 1, 1, 1, 1, 1, 1, 1, 1  …  300, 300, 300, 300, 300, 300, 300, 300, 300, 300], Int32[1, 2, 3, 4, 5, 6, 7, 8, 9, 10  …  8, 9, 10, 11, 12, 13, 14, 15, 16, 17], Int32[1, 1, 1, 1, 1, 1, 1, 1, 1, 1  …  300, 300, 300, 300, 300, 300, 300, 300, 300, 300]), Float32[0.4093905 0.24062634 … 0.29476473 0.65913755])

In [15]:
# Wrap the (indices, values) pair in a data loader with batches of 4096 observations.
data = Flux.Data.DataLoader((X, Y); batchsize = 4096)

# `train_plot` comes from embeddings.jl; the argument meanings below are assumptions
# to confirm against its definition.
train_plot(
    data,
    X,
    Y,
    (2, 2, 2),              # presumably the embedding size of each of the three factors
    "embeddings_$(now())",  # timestamped name; appears to be used as the output folder
    data_matrix,
    2000,                   # presumably the number of training steps
)
## training 

## plotting results 
### scatterplot - predicted expr. vs true 
### training curve - MSE vs epoch
### scatterplot - trained embedding (UMAP) - colors by cyto-group  

Creating folder 'embeddings_2022-06-14T16:45:47.610'


0.0%┣                                            ┫ 0/2.0k [00:00<-1:-40, -0s/it]
0.1%┣                                         ┫ 1/2.0k [00:00<Inf:Inf, InfGs/it]
0.5%┣▏                                           ┫ 10/2.0k [00:00<00:43, 47it/s]
1.0%┣▍                                           ┫ 20/2.0k [00:00<00:29, 68it/s]
1.5%┣▋                                           ┫ 30/2.0k [00:00<00:25, 79it/s]
2.0%┣▉                                           ┫ 40/2.0k [00:00<00:23, 86it/s]
2.5%┣█                                           ┫ 50/2.0k [00:01<00:22, 90it/s]
3.0%┣█▎                                          ┫ 60/2.0k [00:01<00:21, 93it/s]
3.5%┣█▌                                          ┫ 70/2.0k [00:01<00:22, 88it/s]
4.0%┣█▊                                          ┫ 80/2.0k [00:01<00:21, 91it/s]
4.5%┣██                                          ┫ 90/2.0k [00:01<00:21, 93it/s]
5.0%┣██▏                                        ┫ 100/2.0k [00:01<00:20, 95it/s]
5.5%┣██▍                    

Generating dim1 plot...


100.0%┣███████████████████████████████████████┫ 2.0k/2.0k [00:18<00:00, 111it/s]
100.0%┣███████████████████████████████████████┫ 2.0k/2.0k [00:18<00:00, 111it/s]


Generating dim2 plot...
Generating molecule_layer plot...


0.9279642f0

In [14]:
# Display the first-dimension labels stored in the `Data` wrapper.
data_matrix.d1_index

300-element Vector{String}:
 "01H001"
 "02H003"
 "02H009"
 "02H017"
 "02H026"
 "02H033"
 "02H053"
 "02H066"
 "03H016"
 "03H022"
 "03H024"
 "03H028"
 "03H036"
 ⋮
 "13H179"
 "13H185"
 "13H186"
 "14H001"
 "14H007"
 "14H012"
 "14H015"
 "14H017"
 "14H019"
 "14H020"
 "14H023"
 "14H038"