In [1]:
] activate "/Users/anarres/Documents/projects/BioXP"

[32m[1m Activating[22m[39m environment at `~/Documents/projects/BioXP/Project.toml`


Some visual sugar

In [2]:
# ] add ProgressMeter

In [3]:
using BioXP
using ProgressMeter
using Random
using JSON
# using BenchmarkTools

### Inputs

In [4]:
# Directory of per-organism reaction-id JSONs exported from ecg (JGI).
input_dir = "data/input/rids-methanogens/"

# Master reaction file from ecg; dgs were added by the `add_dgs_to_master` .py scripts.
rstructs_path = "data/input/rstructs/master_from_redges-og-submission.json"

# User-defined seeds/targets file, followed by the seed-set and target-set
# names (presumably keys into that file -- confirm against the reader).
seeds_and_targets_path = "data/input/seeds/seeds.json"
sid_name, tid_name = "Enceladus_20-SAFR-032", "targets_Freilich09";

#### Seed set randomization

In [5]:
# Location of the compound structure files handed to `readcompounds`.
compound_structs_path = "data/input/compound/"

# Arguments forwarded, in this order, to `randomizecompounds` after the
# compound list and compound structs.
n_runs, n_swaps = 100, 1000
beta = 20
sortkey = :exact_mass
zero_mass_behavior = "end"

# Integer seed only: a fresh MersenneTwister is constructed from it at the
# point of use rather than sharing one RNG object across cells.
rng_seed = 1234

### Path to write to

In [6]:
write_dir = "data/input/sid_randomizations-methanogens/";

# Create the output directory together with any missing parents. `mkpath` is a
# no-op when the directory already exists, so no existence check is needed.
# Unlike an `ispath` guard, it also raises immediately if the path is occupied
# by a regular file instead of deferring the failure to the first write.
mkpath(write_dir)

"data/input/sid_randomizations-methanogens"

### Check rids in rstructs

In [22]:
# Load the master reaction file; as the output below shows, the result is a Dict keyed by reaction id.
rstructs = readmaster(rstructs_path)

Dict{String,Reaction} with 9934 entries:
  "R03857" => Reaction("R03857", ["C01832", "C00016"], ["C03221", "C01352"], Di…
  "R02250" => Reaction("R02250", ["C00422", "C00001"], ["C00641", "C00162"], Di…
  "R08640" => Reaction("R08640", ["C17224", "C00024", "C00001"], ["C17226", "C0…
  "R07506" => Reaction("R07506", ["C15778", "C00005", "C00080", "C00007"], ["C0…
  "R10331" => Reaction("R10331", ["C20518", "C00028"], ["C04638", "C00030"], Di…
  "R00340" => Reaction("R00340", ["C02107"], ["C00036", "C00001"], Dict{String,…
  "R06599" => Reaction("R06599", ["C12176"], ["C14721"], Dict{String,Any}())
  "R02433" => Reaction("R02433", ["C00506", "C00026"], ["C05528", "C00025"], Di…
  "R02769" => Reaction("R02769", ["C01200", "C00009"], ["C00686", "C00074", "C0…
  "R01081" => Reaction("R01081", ["C00121"], ["C00309"], Dict{String,Any}())
  "R07043" => Reaction("R07043", ["C05966"], ["C14813"], Dict{String,Any}())
  "R08152" => Reaction("R08152", ["C16504", "C00001"], ["C16506"], Dict{String,…

In [11]:
# Read the reaction ids for one organism file (archaea/2511231210.json) to check them against rstructs.
rids = readids(joinpath(input_dir,"archaea","2511231210.json"));

In [13]:
"R03038" in rids

true

In [14]:
# Number of reaction ids read for this organism, before any filtering against rstructs.
length(rids)

727

In [28]:
# Reaction ids of this organism that the rstructs filter drops.
# NOTE(review): the outputs report 727 rids in total, 646 kept and 52 listed
# here, which does not add up; `setdiff` returns unique elements, so duplicate
# entries in `rids` could explain the gap -- verify.
setdiff(rids, BioXP.remove_rids_not_in_rstructs(rstructs, rids))

52-element Array{String,1}:
 "R03038"
 "R06209"
 "R03646"
 "R06280"
 "R05577"
 "R07157"
 "R02596"
 "R03919"
 "R04007"
 "R07443"
 "R11906"
 "R06200"
 "R11029"
 ⋮
 "R03660"
 "R03661"
 "R11893"
 "R08218"
 "R03662"
 "R03663"
 "R03664"
 "R02918"
 "R11891"
 "R11892"
 "R03665"
 "R11896"

In [23]:
# Reaction ids returned by the filter, i.e. (going by the function name) those that are present in rstructs.
BioXP.remove_rids_not_in_rstructs(rstructs,rids)

646-element Array{String,1}:
 "R02003"
 "R03898"
 "R03896"
 "R07399"
 "R00475"
 "R01341"
 "R04640"
 "R10716"
 "R10331"
 "R10325"
 "R10326"
 "R09375"
 "R09376"
 ⋮
 "R02777"
 "R06513"
 "R02098"
 "R02094"
 "R02100"
 "R05679"
 "R05680"
 "R00597"
 "R10552"
 "R03020"
 "R10496"
 "R10611"

There's a significant number of rids here that aren't in the version of KEGG that I'm using. It's not clear how best to handle them: if I just remove them, that seems pretty biased, but if I update KEGG and use them, that's also pretty biased...

This might not be as big an issue as I think. Many of these could have been excluded because they contain glycans or aren't balanced.

### Make random seeds

In [30]:
rstructs = readmaster(rstructs_path)
compound_structs = readcompounds(compound_structs_path);

# For every sub-directory of `input_dir`, build randomized seed sets for each
# organism JSON it contains and write them, as JSON, to the same relative
# location under `write_dir`.
for path in readdir(input_dir)

    org_dir = joinpath(input_dir, path)
    isdir(org_dir) || continue

    # `mkpath` is a no-op when the directory already exists.
    out_dir = joinpath(write_dir, path)
    mkpath(out_dir)

    # List the directory once and keep only the organism JSONs. The progress
    # total then equals the number of `next!` calls, so the bar can reach 100%
    # even when the directory holds other files.
    json_files = filter(fname -> endswith(fname, ".json"), readdir(org_dir))
    isempty(json_files) && continue

    p = Progress(length(json_files), desc="$path")

    Threads.@threads for fname in json_files

        rids = readids(joinpath(org_dir, fname))

        # Only reactions present in `rstructs` contribute compounds.
        # TODO: `BioXP.identify_biosystem_compounds` may already cover this;
        # consolidate with `list_biosystem_compounds_from_rids`.
        known_rids = BioXP.remove_rids_not_in_rstructs(rstructs, rids)
        cids = list_biosystem_compounds_from_rids(rstructs, known_rids)

        # Each organism gets its own RNG built from the same `rng_seed`, so no
        # RNG object is shared between threads and the result for an organism
        # does not depend on the order in which files are processed.
        random_seed_sets = randomizecompounds(cids,
                                compound_structs,
                                n_runs,
                                n_swaps,
                                beta,
                                sortkey,
                                zero_mass_behavior,
                                MersenneTwister(rng_seed))

        open(joinpath(out_dir, fname), "w") do f
            JSON.print(f, random_seed_sets, 2) # indent = 2
        end

        next!(p)
    end
end


[32marchaea 86%|█████████████████████████████████████▊      |  ETA: 0:00:00[39m

## I'm using the same seed to initialize all the compound randomizations. This should be OK, since a single RNG object will still step through random numbers as it fills out the random seed sets for a single organism. But I need to watch out to make sure that this doesn't cause me problems later.

## I ALREADY HAD THIS FUNCTION `IDENTIFY_BIOSYSTEM_COMPOUNDS`
## SHOULD DELETE `list_biosystem_compounds_from_rids`