In [1]:
] activate "/Users/harrison/ELSI/bioxp/BioXP"

[32m[1m Activating[22m[39m environment at `~/ELSI/bioxp/BioXP/Project.toml`


In [2]:
using BioXP
using ProgressMeter
using BenchmarkTools
using Profile


┌ Info: Precompiling BioXP [5370ad63-a488-43fb-b9e3-b51d93a2b4c4]
└ @ Base loading.jl:1278


In [3]:
# Show the path of the source file the loaded `BioXP` module was imported from.
pathof(BioXP)

"/Users/harrison/ELSI/bioxp/BioXP/src/BioXP.jl"

### Check nthreads

In [5]:
# Number of threads in this Julia session; with 1 thread, `Threads.@threads` loops run serially.
Threads.nthreads()

1

### Put everything in one function for benchmarking

In [9]:
"""
    run_all()

Run the expansion over the methanogen test set and write the results to disk.

Reads the seed ids, target ids and master reaction file from the hard-coded paths
below, calls `expand` once for every non-hidden file in `input_dir` (passing a
per-file `write_path`), then calls `formatbioxpoutput` on the output directory.
All paths are relative to the current working directory.

Returns the value of the final `formatbioxpoutput` call.
"""
function run_all()
    ## Organism JSONs from ecg (JGI)
    # input_dir = "data/input/rids/"
    input_dir = "../test/data/rids-methanogens/"

    ## Master file from ecg (with dgs added from `add_dgs_to_master` .py files)
    rstructs_path = "../test/data/master_from_redges-og-submission.json"

    ## User defined seeds/targets
    seeds_path = "../data/seeds/encel_papers_2019.json"
    targets_path = "../data/seeds/seeds.json"
    sid_name = "Contains KEGGID P"
    tid_name = "targets_Freilich09"

    ## Path to write to
    # write_dir = joinpath("data","output",sid_name)
    write_dir = joinpath("data", "output", sid_name * "-methanogens")

    if !ispath(write_dir)
        mkpath(write_dir)
    end

    sids = readkeyedids(seeds_path)[sid_name]
    tids = readkeyedids(targets_path)[tid_name]
    rstructs = readmaster(rstructs_path)

    # List the directory once and drop hidden entries up front, so the progress
    # total equals the number of files that are actually expanded. Counting every
    # directory entry left the bar stuck below 100% whenever a dotfile was present.
    fnames = filter(f -> !startswith(f, "."), readdir(input_dir))
    p = Progress(length(fnames), desc="$input_dir")

    Threads.@threads for fname in fnames
        rids = readids(joinpath(input_dir, fname))
        # `input_dir` ends with "/", so `basename(input_dir)` is "" and the results
        # are written directly under `write_dir`.
        write_path = joinpath(write_dir, basename(input_dir), fname)
        expand(rstructs, rids, sids, tids=tids, write_path=write_path)
        next!(p)
    end

    simple_output_dir = write_dir #joinpath("data","output",sid_name)

    return formatbioxpoutput(simple_output_dir)
end

run_all (generic function with 1 method)

In [12]:
"""
    run_all_no_write()

Run the expansion over the methanogen test set without passing a `write_path`.

Reads the seed ids, target ids and master reaction file from the hard-coded paths
below and calls `expand` once for every non-hidden file in `input_dir`. The results
of `expand` are discarded; the function exists for benchmarking. All paths are
relative to the current working directory.

Returns `nothing`.
"""
function run_all_no_write()
    ## Organism JSONs from ecg (JGI)
    # input_dir = "data/input/rids/"
    input_dir = "../test/data/rids-methanogens/"

    ## Master file from ecg (with dgs added from `add_dgs_to_master` .py files)
    rstructs_path = "../test/data/master_from_redges-og-submission.json"

    ## User defined seeds/targets
    seeds_path = "../data/seeds/encel_papers_2019.json"
    targets_path = "../data/seeds/seeds.json"
    sid_name = "Contains KEGGID P"
    tid_name = "targets_Freilich09"

    sids = readkeyedids(seeds_path)[sid_name]
    tids = readkeyedids(targets_path)[tid_name]
    rstructs = readmaster(rstructs_path)

    # List the directory once and drop hidden entries up front, so the progress
    # total equals the number of files that are actually expanded. Counting every
    # directory entry left the bar stuck below 100% whenever a dotfile was present.
    fnames = filter(f -> !startswith(f, "."), readdir(input_dir))
    p = Progress(length(fnames), desc="$input_dir")

    Threads.@threads for fname in fnames
        rids = readids(joinpath(input_dir, fname))
        expand(rstructs, rids, sids, tids=tids)
        next!(p)
    end

    return nothing
end

run_all_no_write (generic function with 1 method)

In [10]:
# Scratch check of `&` inside a condition: `true & false` evaluates to `false`,
# so the body is skipped and nothing is printed.
if true & false
    println(1)
end

In [11]:
# Time the whole pipeline: input loading, per-file `expand` with writing, and `formatbioxpoutput`.
@benchmark run_all()

[32m../test/data/rids-methanogens/ 86%|██████████████████   |  ETA: 0:00:00[39m

BenchmarkTools.Trial: 
  memory estimate:  233.30 MiB
  allocs estimate:  562073
  --------------
  minimum time:     1.003 s (4.15% GC)
  median time:      1.009 s (4.49% GC)
  mean time:        1.048 s (8.16% GC)
  maximum time:     1.125 s (14.16% GC)
  --------------
  samples:          5
  evals/sample:     1

In [13]:
# Time the same per-file `expand` loop, but without a `write_path` and without the formatting step.
@benchmark run_all_no_write()

[32m../test/data/rids-methanogens/ 86%|██████████████████   |  ETA: 0:00:00[39m

BenchmarkTools.Trial: 
  memory estimate:  220.26 MiB
  allocs estimate:  341619
  --------------
  minimum time:     943.702 ms (4.19% GC)
  median time:      1.004 s (9.08% GC)
  mean time:        991.628 ms (8.46% GC)
  maximum time:     1.032 s (12.24% GC)
  --------------
  samples:          6
  evals/sample:     1

Benchmarking "Contains KEGGID P-methanogens" without sparse matrices:

```
BenchmarkTools.Trial: 
  memory estimate:  233.30 MiB
  allocs estimate:  562073
  --------------
  minimum time:     1.036 s (3.54% GC)
  median time:      1.080 s (4.31% GC)
  mean time:        1.093 s (7.90% GC)
  maximum time:     1.162 s (13.76% GC)
  --------------
  samples:          5
  evals/sample:     1
```

> Why doesn't it seem any faster after moving to sparse arrays?

## Try the KEGG expansion I was benchmarking with Josh's code and my code

In [7]:
# Inputs for a single `expand` call: the master reaction file, the reaction-id list
# and the "Goldford-SAFR-032" seed set. These are globals used by the cells below.
rstructs = readmaster("../../../liam_goldford_ecod/mydata/kegg_2021-01-05/masterx.json")
rids = readids("../../../liam_goldford_ecod/mydata/kegg_2021-01-05/rids_with_element_conservation-kegg.json")
sids = readkeyedids("../test/data/seeds/seeds.json")["Goldford-SAFR-032"]
# NOTE(review): presumably `nothing` tells `expand` not to write results — confirm against `expand`.
write_path = nothing

In [8]:
# Drop samples left in the profile buffer by earlier runs, so the report that
# follows reflects only this call; the buffer otherwise accumulates across re-runs.
Profile.clear()
# Collect profiling samples for one `expand` call on the inputs loaded above.
@profile expand(rstructs,rids,sids,write_path=write_path)

(  [1   ]  =  1
  [12  ]  =  1
  [13  ]  =  1
  [173 ]  =  1
  [186 ]  =  1
  [231 ]  =  1
  [536 ]  =  1
  [1697]  =  1, 8636-element SparseArrays.SparseVector{Int64,Int64} with 0 stored entries, ["C00001", "C00404", "C02174", "C00002", "C00138", "C05359", "C00009", "C00008", "C00139", "C00013"  …  "C22295", "C22233", "C22296", "C22297", "C22298", "C22277", "C22299", "C22300", "C22301", "C22302"], SparseArrays.SparseVector{Int64,Int64}[  [1   ]  =  1
  [12  ]  =  1
  [13  ]  =  1
  [173 ]  =  1
  [186 ]  =  1
  [231 ]  =  1
  [536 ]  =  1
  [1697]  =  1,   [1   ]  =  1
  [6   ]  =  1
  [11  ]  =  1
  [12  ]  =  1
  [13  ]  =  1
  [18  ]  =  1
  [22  ]  =  1
  [171 ]  =  1
  [172 ]  =  1
  [173 ]  =  1
          ⋮
  [231 ]  =  1
  [254 ]  =  1
  [284 ]  =  1
  [397 ]  =  1
  [517 ]  =  1
  [536 ]  =  1
  [882 ]  =  1
  [1697]  =  1
  [2255]  =  1
  [4072]  =  1
  [7145]  =  1,   [1   ]  =  1
  [6   ]  =  1
  [11  ]  =  1
  [12  ]  =  1
  [13  ]  =  1
  [15  ]  =  1
  [17  ]  =  1
  [18

In [9]:
# Print the collected samples as a flat per-line listing rather than the default call tree.
Profile.print(format=:flat)

 Count  Overhead File                    Line Function
   159       159 @Base/Base.jl             33 getproperty
    23        23 @Base/Base.jl              ? ht_keyindex(::Dict{String,React...
     1         0 @Base/abstractarray.jl    57 axes
     1         0 @Base/abstractarray.jl    75 axes
     1         0 @Base/abstractarray.jl   971 copymutable
     8         0 @Base/abstractarray.jl  2009 foreach
     1         0 @Base/abstractarray.jl   989 isempty
     1         0 @Base/abstractarray.jl  2162 map
     8         0 @Base/abstractarray.jl   674 similar
     8         0 @Base/abstractarray.jl   675 similar
     1         0 @Base/abstractdict.jl     17 haskey
     1         0 @Base/abstractset.jl      46 #261
     1         0 @Base/abstractset.jl     138 intersect!
     1         0 @Base/abstractset.jl     422 mapfilter(::Base.var"#64#65"{Ba...
     1         0 @Base/abstractset.jl      92 union!(::Set{String}, ::Array{S...
     1         0 @Base/abstractset.jl     417 unsafe_filt

In [10]:
# Benchmark one full `expand` call. The inputs are non-constant globals, so they are
# interpolated with `$` to keep global-variable access out of the measured code.
@benchmark x, t, cids, X, Y = expand($rstructs,$rids,$sids,write_path=$write_path) samples=3 seconds=150

BenchmarkTools.Trial: 
  memory estimate:  6.82 GiB
  allocs estimate:  328290
  --------------
  minimum time:     8.464 s (1.00% GC)
  median time:      8.676 s (1.05% GC)
  mean time:        8.636 s (1.08% GC)
  maximum time:     8.768 s (1.04% GC)
  --------------
  samples:          3
  evals/sample:     1

In [16]:
# Repeat of the `expand` benchmark. The inputs are non-constant globals, so they are
# interpolated with `$` to keep global-variable access out of the measured code.
@benchmark x, t, cids, X, Y = expand($rstructs,$rids,$sids,write_path=$write_path) samples=3 seconds=150

BenchmarkTools.Trial: 
  memory estimate:  2.76 GiB
  allocs estimate:  54095
  --------------
  minimum time:     29.268 s (0.65% GC)
  median time:      30.945 s (0.27% GC)
  mean time:        31.037 s (0.36% GC)
  maximum time:     32.897 s (0.19% GC)
  --------------
  samples:          3
  evals/sample:     1

### Inputs

In [5]:
# Configuration for the stand-alone run in the cells that follow.
# All paths are relative to the current working directory.

## Organism JSONs from ecg (JGI)
# input_dir = "data/input/rids/"
input_dir = "../test/data/input/rids-methanogens/"

## Master file from ecg (with dgs added from `add_dgs_to_master` .py files)
rstructs_path = "../test/data/input/rstructs/master_from_redges-og-submission.json"

## User defined seeds/targets
# `sid_name` / `tid_name` are the keys looked up in the seed and target files.
seeds_path = "../data/input/seeds/encel_papers_2019.json"
targets_path = "../data/input/seeds/seeds.json"
sid_name = "Contains KEGGID P"
tid_name = "targets_Freilich09";

## Path to write to
# write_dir = joinpath("data","output",sid_name)
write_dir = joinpath("data","output",sid_name*"-methanogens")

# Create the output directory (and any missing parents) if nothing exists at that path yet.
if !ispath(write_dir)
    mkpath(write_dir)
end

"data/output/Contains KEGGID P-methanogens"

### Run expansion

Double-check the number of available threads.

In [6]:
# Number of threads in this Julia session; with 1 thread, the `Threads.@threads` loop below runs serially.
Threads.nthreads()

1

Looping such that seeds are logically assigned

In [10]:
# Load the seed ids, target ids and master reaction file configured above.
sids = readkeyedids(seeds_path)[sid_name]
tids = readkeyedids(targets_path)[tid_name]
rstructs = readmaster(rstructs_path)

# Each non-hidden sub-directory of `input_dir` is processed in turn; every non-hidden
# file inside it is expanded and written to the matching sub-directory of `write_dir`.
for path in readdir(input_dir)

    org_dir = joinpath(input_dir, path)

    # `&&` is the short-circuiting boolean AND intended here (the bitwise `&`
    # evaluated both sides unconditionally).
    if isdir(org_dir) && !startswith(path, ".")

        if !ispath(joinpath(write_dir, path))
            mkpath(joinpath(write_dir, path))
        end

        # List the directory once and drop hidden entries up front, so the progress
        # total equals the number of files that are actually expanded. Counting every
        # entry left the bar stuck below 100% whenever a dotfile was present.
        fnames = filter(f -> !startswith(f, "."), readdir(org_dir))
        p = Progress(length(fnames), desc="$path")

        println(org_dir)
        Threads.@threads for fname in fnames
            rids = readids(joinpath(org_dir, fname))
            write_path = joinpath(write_dir, basename(org_dir), fname)
            # NOTE(review): `run_all` passes `tids` and `write_path` as keywords;
            # confirm which calling form the current `expand` accepts.
            expand(rstructs, rids, sids, tids, write_path) ## new results
            next!(p)
        end
    end
end

data/input/rids-methanogens/archaea


[32marchaea 50%|██████████████████████                      |  ETA: 0:00:00[39m

### Format output

In [11]:
# Run `formatbioxpoutput` on every non-hidden entry of the output directory.
simple_output_dir = write_dir #joinpath("data","output",sid_name)

for entry in readdir(simple_output_dir)
    # Names starting with "." are skipped.
    startswith(entry, ".") && continue
    org_dir = joinpath(simple_output_dir, entry)
    formatbioxpoutput(org_dir)
end