In [37]:
using Pkg
using CSV 
using DataFrames
using Dates
using CUDA
using Flux

In [4]:
using Statistics
using Random

In [5]:
filename = "/u/sauves/leucegene-shared/Data/LEUCEGENE/lgn_pronostic_GE_CDS_TPM.csv"
#GE_TRSC_TPM = DataFrame(CSV.File(filename))
@time GE_CDS_TPM = CSV.read(filename, DataFrame)
print()

 10.134341 seconds (24.32 M allocations: 1.168 GiB, 4.40% gc time)


In [1]:
mutable struct Data
    name::String
    data::Array
    factor_1::Array
    factor_2::Array
end 

In [2]:
mutable struct FoldData 
    name::String 
    train::Data
    test::Data
end 

In [42]:
index = GE_CDS_TPM[:,1]
data = GE_CDS_TPM[:,2:end] 
cols = names(data)
# log transforming
data = log10.(Array(data) .+ 1)
# remove least varying genes
ge_var = var(data,dims = 1) 
ge_var_med = median(ge_var)
# high variance only 
hvg = getindex.(findall(ge_var .> ge_var_med),2)
data = data[:,hvg]
cols = cols[hvg]
# verify that the operation worked
sum(var(data, dims = 1) .< ge_var_med)
# split into train test
nsamples = length(index)
indices = shuffle(Array{Int}(1:nsamples))
nfolds = 5
foldsize = Int(nsamples / 5)

folds = Array{FoldData}(undef,5)
for i in 1:nfolds
    tst_idx = indices[(i - 1) * foldsize + 1: i * foldsize]
    tr_idx = setdiff(indices, tst_idx)
    test = Data("test", data[tst_idx,:], index[tst_idx], cols)
    train = Data("train", data[tr_idx,:], index[tr_idx], cols)
    fold_data = FoldData("fold_$i", train, test)
    folds[i] = fold_data
end
folds[1].train.factor_1
folds[1].train.data





240×9798 Matrix{Float64}:
 0.60585    1.02385   0.232412   1.16368    …  0.00803749  3.84372  3.78002
 0.0881841  0.499486  0.188744   0.463801      1.61029     4.01487  3.991
 0.736662   0.594659  0.0591334  0.12302       1.66703     3.81244  3.70276
 1.17896    1.425     1.01698    1.6321        0.0         4.10253  3.62215
 0.129352   0.704893  0.144021   0.662844      1.47628     3.8845   3.60522
 0.117166   0.49951   0.0635713  0.469279   …  0.00727938  4.02401  4.04411
 0.0        0.930786  0.269041   0.158783      1.6067      3.52121  3.53603
 0.232632   0.644945  0.0490343  0.0621038     0.602884    3.91825  3.77371
 0.371633   0.798888  0.113777   0.071067      0.0         3.77846  3.6158
 0.365782   1.11457   0.108813   0.127687      0.0173291   3.78302  3.40982
 ⋮                                          ⋱  ⋮                    
 0.0386816  1.09603   0.427607   0.385448      0.0         3.77966  3.88506
 0.414905   0.87601   0.157483   0.670209      0.00490658  3.88181  3.18

In [43]:
function prep_data(data::Data; device = gpu)
    ## data preprocessing
    ### remove index columns, log transform
    n = length(data.factor_1)
    m = length(data.factor_2)
    values = Array{Float32,2}(undef, (1, n * m))
    print(size(values))
    factor_1_index = Array{Int32,1}(undef, n * m)
    factor_2_index = Array{Int32,1}(undef, n * m)
    # d3_index = Array{Int32,1}(undef, n * m)
    for i in 1:n
        for j in 1:m
            index = (i - 1) * m + j 
            values[1, index] = data.data[i, j]
            factor_1_index[index] = i # Int
            factor_2_index[index] = j # Int 
            # d3_index[index] = data.d3_index[i] # Int 
        end
    end
    return (device(factor_1_index), device(factor_2_index)), device(values)
end 

prep_data (generic function with 1 method)

In [49]:
X_, Y_ = prep_data(folds[5].train)

(1, 2351520)

((Int32[1, 1, 1, 1, 1, 1, 1, 1, 1, 1  …  240, 240, 240, 240, 240, 240, 240, 240, 240, 240], Int32[1, 2, 3, 4, 5, 6, 7, 8, 9, 10  …  9789, 9790, 9791, 9792, 9793, 9794, 9795, 9796, 9797, 9798]), Float32[0.30796877 1.0043808 … 3.742893 3.4990356])

In [51]:
for fold_data in folds
    X_, Y_ = prep_data(fold_data.train)
    # train embedding, retrieve model
end

(1, 2351520)

(1, 2351520)

(1, 2351520)

(1, 2351520)

(1, 2351520)