# 210718 Compile edits

In [1]:
using JSON
using DataFrames
using FilePathsBase
using FilePathsBase: /
using CSV

## Setup

In [2]:
# Report a large terminal size through the environment; presumably so that wide
# and long tables are not truncated when displayed in the notebook.
ENV["COLUMNS"] = 400
ENV["LINES"] = 100

100

In [3]:
# Date stamp and name of this notebook; NBNAME is used below to name the
# intermediate output directory.
DATESTR = "210718"
NBNAME = "$DATESTR-compile-edits"

"210718-compile-edits"

In [4]:
# Input locations, relative to the notebook's working directory.
infiles = Dict(
    # Directory holding "taxa.csv" and "genome-taxon-assignments.json"
    :db => p"../../data/intermediate/210401-database-v2-fix-species-overlaps/210424-compile-fixes/",
    # CSV of genomes removed by the 210710-remove-min-inter-outliers notebook
    :_210710_removed_genomes => p"../../data/processed/210428-database-v1.2-fix-remaining-overlaps/210710-remove-min-inter-outliers/210710-removed-genomes.csv",
);

In [5]:
# Directory that receives this notebook's intermediate output files.
intermediate_out = p"data-intermediate" / NBNAME
# Create the directory on first run. `recursive=true` also creates the
# "data-intermediate" parent, which a plain `mkdir` would require to exist already.
isdir(intermediate_out) || mkdir(intermediate_out; recursive=true)

true

## Load data

### Taxonomy

In [6]:
# Taxonomy table as loaded from the input database; kept unmodified as the
# reference for looking up the original taxa.
taxa = DataFrame(CSV.File(string(infiles[:db] / "taxa.csv")));

In [7]:
# Taxon ID assigned to each genome, parsed from a JSON array and converted to a
# concretely typed integer vector.
genome_assignments = open(infiles[:db] / "genome-taxon-assignments.json") do io
    Vector{Int}(JSON.parse(io))
end
ngenomes = length(genome_assignments)

50752

In [8]:
# Map each taxon ID to its row number in the original `taxa` table.
tid_to_tidx = Dict(zip(taxa.id, eachindex(taxa.id)))

# Row number(s) in `taxa` for a single taxon ID or a vector of IDs.
# An ID that is not in `taxa` raises a `KeyError`.
taxon_index(tid::Integer) = tid_to_tidx[tid]
taxon_index(tids::AbstractVector{<:Integer}) = map(taxon_index, tids)

# Row(s) of `taxa` for the given taxon ID(s), optionally restricted to `cols`.
lookup_tid(tid, cols=:) = taxa[taxon_index(tid), cols]

lookup_tid (generic function with 2 methods)

### 210710-remove-min-inter-outliers removed genomes

In [9]:
# Genomes removed by the 210710 notebook; the `genome_index` and `taxon_id`
# columns are used below when integrating the removals.
removed_genomes_210710 = DataFrame(CSV.File(infiles[:_210710_removed_genomes]));

## Init new taxonomy data

In [10]:
# Working copy of the taxonomy restricted to the columns carried forward.
# All edits below are applied to `new_taxa`; `taxa` itself is left untouched.
new_taxa = taxa[:, [:id, :ncbi_id, :name, :rank, :parent_id, :in_v12, :manual_threshold, :report]];

In [11]:
# Working copy of the genome -> taxon ID assignments; the original vector is kept
# for consistency checks.
genome_assignments_new = copy(genome_assignments);

In [12]:
# Next unused taxon ID: one greater than the largest ID currently in new_taxa
new_taxid() = 1 + maximum(new_taxa.id)

new_taxid (generic function with 1 method)

In [13]:
# Default column values for rows created with `new_taxon`; any of them can be
# overridden by keyword argument. `name` has no default here.
TAXON_DEFAULTS = (ncbi_id=missing, rank=missing, parent_id=missing, in_v12=false, manual_threshold=NaN, report=true);

In [14]:
"""
    new_taxon(; kw...)

Append a row to the `new_taxa` table and return the inserted values as a named tuple.

The row's `id` comes from `new_taxid()`. Every other column takes its value from
`TAXON_DEFAULTS` unless overridden by a keyword argument. Columns without a default
must be passed explicitly.
"""
function new_taxon(; kw...)
    # Later arguments to `merge` win, so keyword arguments override the defaults.
    row = merge((id=new_taxid(),), TAXON_DEFAULTS, NamedTuple(kw))
    push!(new_taxa, row)
    return row
end

new_taxon

In [15]:
# Row number(s) in `new_taxa` for the given taxon ID(s). The table is searched on
# every call, so results stay valid after rows are added or removed; `only`
# throws unless exactly one row carries the ID.
taxon_index_new(tid::Integer) = only(i for (i, id) in enumerate(new_taxa.id) if id == tid)
taxon_index_new(tids::AbstractVector{<:Integer}) = map(taxon_index_new, tids)

taxon_index_new (generic function with 2 methods)

In [16]:
# Extra data per taxon, presumably keyed by taxon ID.
# NOTE(review): nothing in the visible notebook populates or reads this — confirm it is still needed.
taxon_extra = Dict{Int, Any}()

Dict{Int64,Any}()

## Assign taxon keys

In [17]:
# Key prefix for taxa that have no NCBI ID (custom taxa defined by this database).
KEY_PREFIX = "gambit/refseq-curated"

"gambit/refseq-curated"

### NCBI taxa

In [18]:
# Taxa with an NCBI ID get the key "ncbi/taxonomy/<ncbi_id>"; all others are left
# missing for now and are filled in by the following cells.
new_taxa[!, :key] = let prefix = "ncbi/taxonomy/"
    [ismissing(id) ? missing : string(prefix, id) for id in new_taxa.ncbi_id]
end;

### Subgroup taxa

In [19]:
# Custom "<name> subgroup <n>" taxa have no NCBI ID: build their key from the
# taxon name with spaces replaced by hyphens.
for taxon in eachrow(new_taxa)
    ismissing(taxon.ncbi_id) || continue
    occursin(r".* subgroup \d", taxon.name) || continue
    taxon.key = string(KEY_PREFIX, "/", replace(taxon.name, " " => "-"))
end

### Remaining

In [20]:
# Taxa that still have no key after the automatic assignments above
new_taxa[findall(ismissing, new_taxa.key), :]

Unnamed: 0_level_0,id,ncbi_id,name,rank,parent_id,in_v12,manual_threshold,report,key
Unnamed: 0_level_1,Int64,Int64?,String,String?,Int64?,Bool,Float64,Bool,String?
1,1948,missing,Lacticaseibacillus casei/paracasei,missing,281,0,,1,missing


In [21]:
# Taxon 1948 ("Lacticaseibacillus casei/paracasei") is the only remaining taxon
# without a key; assign it by hand, using "_" in place of the "/" in its name.
new_taxa[taxon_index_new(1948), :key] = "$KEY_PREFIX/Lacticaseibacillus-casei_paracasei"

"gambit/refseq-curated/Lacticaseibacillus-casei_paracasei"

## Integrate 210710-remove-min-inter-outliers

In [22]:
# Unassign every genome removed by the 210710 notebook by setting its taxon ID to 0,
# after checking that the removal record agrees with the original assignment.
foreach(eachrow(removed_genomes_210710)) do row
    @assert genome_assignments[row.genome_index] == row.taxon_id
    genome_assignments_new[row.genome_index] = 0
end

## Escherichia / Shigella



* Escherichia / Shigella (genus)
    * E. coli / Shigella (species)
        * E. coli subgroups...
        * Shigella (No rank)
            * Shigella species...
    * Other Escherichia species...

### Remove old Escherichia and E. coli taxa

In [23]:
# Look up the original Escherichia genus in the unmodified `taxa` table
# (`only` throws unless exactly one row has this name).
escherichia_old = taxa[only(findall(==("Escherichia"), taxa.name)), :]

# Drop that taxon from the working copy.
delete!(new_taxa, new_taxa.id .== escherichia_old.id);

In [24]:
# Remove old E. coli: look it up in the unmodified `taxa` table
# (`only` throws unless exactly one row has this name).
ecoli_old = taxa[only(findall(==("Escherichia coli"), taxa.name)), :]

# Drop that taxon from the working copy.
delete!(new_taxa, new_taxa.id .== ecoli_old.id);

In [25]:
# Assert no genomes to reassign: no genome may still point directly at the
# removed taxa.
# The comparison must use the taxon *IDs*. `escherichia_old` and `ecoli_old` are
# table rows, which never compare equal to the integer IDs stored in
# `genome_assignments_new`, so comparing against the rows themselves would make
# these checks pass no matter what.
@assert !any(==(escherichia_old.id), genome_assignments_new)
@assert !any(==(ecoli_old.id), genome_assignments_new)

### Create new taxa

In [26]:
# New Escherichia/Shigella genus (no parent; replaces the removed Escherichia genus)
escherichia_shigella = new_taxon(
    key="$KEY_PREFIX/Escherichia_Shigella",
    name="Escherichia/Shigella",
    rank="genus",
)

(id = 1949, ncbi_id = missing, rank = "genus", parent_id = missing, in_v12 = false, manual_threshold = NaN, report = true, key = "gambit/refseq-curated/Escherichia_Shigella", name = "Escherichia/Shigella")

In [27]:
# New E. coli/Shigella species, placed under the new Escherichia/Shigella genus
ecoli_shigella = new_taxon(
    key="$KEY_PREFIX/Escherichia_coli_Shigella",
    name="Escherichia coli/Shigella",
    rank="species",
    parent_id=escherichia_shigella.id,
)

(id = 1950, ncbi_id = missing, rank = "species", parent_id = 1949, in_v12 = false, manual_threshold = NaN, report = true, key = "gambit/refseq-curated/Escherichia_coli_Shigella", name = "Escherichia coli/Shigella")

### Reassign taxon parents

In [28]:
# Reassign children of Escherichia -> new Escherichia/Shigella
# (`isequal` treats a missing parent_id as "no match" instead of propagating missing)
for taxon in eachrow(new_taxa)
    isequal(taxon.parent_id, escherichia_old.id) || continue
    println("$(taxon.id) $(taxon.name)")
    taxon.parent_id = escherichia_shigella.id
end

628 Escherichia fergusonii
1440 Escherichia albertii


In [29]:
# Reassign children of E. coli -> new E. coli/Shigella
# (`isequal` treats a missing parent_id as "no match" instead of propagating missing)
for taxon in eachrow(new_taxa)
    isequal(taxon.parent_id, ecoli_old.id) || continue
    println("$(taxon.id) $(taxon.name)")
    taxon.parent_id = ecoli_shigella.id
end

1931 Escherichia coli subgroup 1
1932 Escherichia coli subgroup 2
1933 Escherichia coli subgroup 3


### Update Shigella

In [30]:
# The Shigella row of the working table (`only` throws unless exactly one row has
# this name). It is modified through this reference in the next cell.
shigella = only(filter(row -> row.name == "Shigella", eachrow(new_taxa)))

Unnamed: 0_level_0,id,ncbi_id,name,rank,parent_id,in_v12,manual_threshold,report,key
Unnamed: 0_level_1,Int64,Int64?,String,String?,Int64?,Bool,Float64,Bool,String?
154,161,620,Shigella,genus,missing,1,,1,ncbi/taxonomy/620


In [31]:
# Move Shigella underneath the new "Escherichia coli/Shigella" species.
shigella.parent_id = ecoli_shigella.id
# It now sits below a species, so it no longer carries a rank ("No rank" in the outline above).
shigella.rank = missing
# NOTE(review): the meaning of a manual threshold of 0 is not visible in this notebook — confirm.
shigella.manual_threshold = 0
;

## Additional taxon data

In [32]:
# Every taxon must have been given a key by this point.
@assert !any(ismissing, new_taxa.key)

In [33]:
# A taxon is a leaf when no other taxon names it as its parent.
let parents = Set(skipmissing(new_taxa.parent_id))
    new_taxa[!, :is_leaf] = [id ∉ parents for id in new_taxa.id]
end;

In [34]:
# Label of the form "<id> <name>" for each taxon.
new_taxa[!, :plot_label] = ["$id $name" for (id, name) in zip(new_taxa.id, new_taxa.name)];

## Review custom taxa

In [35]:
# Custom taxa, i.e. those without an NCBI ID
new_taxa[findall(ismissing, new_taxa.ncbi_id), :]

Unnamed: 0_level_0,id,ncbi_id,name,rank,parent_id,in_v12,manual_threshold,report,key,is_leaf,plot_label
Unnamed: 0_level_1,Int64,Int64?,String,String?,Int64?,Bool,Float64,Bool,String?,Bool,String
1,1901,missing,Prochlorococcus marinus subgroup 1,missing,803,0,,0,gambit/refseq-curated/Prochlorococcus-marinus-subgroup-1,1,1901 Prochlorococcus marinus subgroup 1
2,1902,missing,Prochlorococcus marinus subgroup 2,missing,803,0,,0,gambit/refseq-curated/Prochlorococcus-marinus-subgroup-2,1,1902 Prochlorococcus marinus subgroup 2
3,1903,missing,Prochlorococcus marinus subgroup 3,missing,803,0,,0,gambit/refseq-curated/Prochlorococcus-marinus-subgroup-3,1,1903 Prochlorococcus marinus subgroup 3
4,1904,missing,Prochlorococcus marinus subgroup 4,missing,803,0,,0,gambit/refseq-curated/Prochlorococcus-marinus-subgroup-4,1,1904 Prochlorococcus marinus subgroup 4
5,1905,missing,Clostridium botulinum subgroup 1,missing,929,0,,0,gambit/refseq-curated/Clostridium-botulinum-subgroup-1,1,1905 Clostridium botulinum subgroup 1
6,1906,missing,Clostridium botulinum subgroup 2,missing,929,0,,0,gambit/refseq-curated/Clostridium-botulinum-subgroup-2,1,1906 Clostridium botulinum subgroup 2
7,1907,missing,Clostridium botulinum subgroup 3,missing,929,0,,0,gambit/refseq-curated/Clostridium-botulinum-subgroup-3,1,1907 Clostridium botulinum subgroup 3
8,1908,missing,Bacillus cereus subgroup 1,missing,899,0,0.6,0,gambit/refseq-curated/Bacillus-cereus-subgroup-1,1,1908 Bacillus cereus subgroup 1
9,1909,missing,Bacillus cereus subgroup 2,missing,899,0,0.55,0,gambit/refseq-curated/Bacillus-cereus-subgroup-2,1,1909 Bacillus cereus subgroup 2
10,1910,missing,Bacillus cereus subgroup 3,missing,899,0,,0,gambit/refseq-curated/Bacillus-cereus-subgroup-3,1,1910 Bacillus cereus subgroup 3


## Write output

In [36]:
# Write the edited taxonomy table, including the added key, is_leaf and plot_label columns.
CSV.write(intermediate_out / "taxa.csv", new_taxa)

p"data-intermediate/210718-compile-edits/taxa.csv"

In [37]:
# Write the updated genome -> taxon ID assignments as a JSON array
# (removed genomes carry the ID 0).
open(
    io -> JSON.print(io, genome_assignments_new),
    intermediate_out / "genome-assignments.json",
    "w",
)