# 210328 Compile edits

In [1]:
using JSON
using CSV

In [2]:
using DataFrames
using FilePathsBase
using FilePathsBase: /
using DataStructures

## Setup

In [3]:
ENV["COLUMNS"] = 400

400

In [4]:
DATESTR = "210328"
NBNAME = "$DATESTR-compile-edits"

"210328-compile-edits"

In [5]:
infiles = Dict(
    :distances => p"../../data/intermediate/200727-find-overlaps/genome-pw-distances.raw-float32",
    :formatted => p"data-intermediate/210303-format-data/",
    :species_overlaps => p"data-intermediate/210323-find-species-overlaps/",
    :problem_species => p"data-intermediate/210327-fix-problem-species/",
    :identical_genomes => p"data-intermediate/210317-find-identical-genomes/",
);

In [6]:
intermediate_out = p"data-intermediate" / NBNAME
isdir(intermediate_out) || mkdir(intermediate_out)

true

## Load data

### Database

In [7]:
genomes_df = hcat(
    DataFrame(CSV.File(infiles[:formatted] / "genomes-v1.1.csv")),
    DataFrame(CSV.File(infiles[:species_overlaps] / "genomes-addendum.csv")),
)
insertcols!(genomes_df, 1, :index => axes(genomes_df, 1))

ngenomes = nrow(genomes_df)

50752

In [8]:
species_df = hcat(
    DataFrame(CSV.File(infiles[:formatted] / "species-v1.2.csv")),
    DataFrame(CSV.File(infiles[:species_overlaps] / "species-addendum.csv")),
)
insertcols!(species_df, 1, :index => axes(species_df, 1))

nspecies = nrow(species_df)

1438

In [9]:
genera_df = DataFrame(CSV.File(infiles[:formatted] / "genera-v1.2.csv"))

ngenera = nrow(genera_df)

462

In [10]:
gidxs_by_species = [Int[] for i in 1:nspecies]

for (i, si) in enumerate(genomes_df[!, :species_v12_idx1])
    si == 0 || push!(gidxs_by_species[si], i)
end

### Identical genome groups

In [11]:
identical_genome_groups = open(JSON.parse, infiles[:identical_genomes] / "identical-genome-groups.json");

### Problem species fixes

In [12]:
problem_genome_fixes = open(JSON.parse, infiles[:problem_species] / "fixes.json");

## Build master taxon table

Table containing all current taxa from v1.2 DB

In [13]:
taxa_df = let
    df = genera_df[:, [:db_id, :ncbi_id, :name]]
    rename!(df, :db_id => :id)
    df[:, :rank] .= "genus"
    allowmissing!(df, [:ncbi_id, :rank])
    
    df[:, :parent_id] = Vector{Union{Int, Missing}}(undef, nrow(df))
    df[:, :parent_id] .= missing
    
    let
        df2 = species_df[:, [:db_id, :taxid, :name, :genus_id]]
        rename!(df2, :db_id => :id, :taxid => :ncbi_id, :genus_id => :parent_id)
        df2[:, :rank] .= "species"
        append!(df, df2)
    end
    
    
    df[:, :in_v12] .= true
    df[:, :manual_threshold] .= NaN
    df[:, :report] .= true
    df[:, :is_leaf] = df[:, :rank] .== "species"
    
    df
end

summary(taxa_df)

"1900×9 DataFrame"

In [14]:
taxon_idx(id::Int) = findfirst(==(id), taxa_df[!, :id])

taxon_idx (generic function with 1 method)

In [15]:
genome_assignments = [
    i > 0 ? species_df[i, :db_id] : 0
    for i in genomes_df[!, :species_v12_idx1]
];

## Apply edits

### Identical genome groups

In [16]:
for group in identical_genome_groups
    sid = only(unique(genome_assignments[gi] for gi in group["genome_indices"]))
    genome_assignments[group["genome_indices"]] .= 0
    genome_assignments[group["exemplar"]] = sid
end

### Problem species fixes

#### Deleted genomes

In [17]:
genome_assignments[problem_genome_fixes["deleted_genomes"]] .= 0;

#### Threshold assignments

In [18]:
for item in problem_genome_fixes["threshold_assignments"]
    sid = species_df[item["species_index"], :db_id]
    taxa_df[taxon_idx(sid), :manual_threshold] = item["threshold"]
end

#### Split species into subgroups

In [19]:
for item in problem_genome_fixes["species_splits"]
    sid = species_df[item["species_index"], :db_id]
    @show sid
    
    # Modifications to parent species taxon
    parent_row = taxa_df[taxon_idx(sid), :]
    parent_row[:manual_threshold] = 0.
    parent_row[:is_leaf] = false
    
    # Unassign all genomes from species
#     genome_assignments[gidxs] .= 0
    
    next_id = maximum(taxa_df[!, :id]) + 1
    
    for (i, gidxs) in enumerate(item["subgroup_genomes"])
        # Create child taxon
        push!(taxa_df, (
            id=next_id,
            ncbi_id=missing,
            name=string(parent_row[:name], " subgroup ", i),
            rank=missing,
            parent_id=sid,
            in_v12=false,
            manual_threshold=NaN,
            report=false,
            is_leaf=true,
        ))
        
        # Assign genomes
        for gi in gidxs
            # May have been deleted in previous steps
            genome_assignments[gi] == 0 && continue
            @assert genome_assignments[gi] == sid "$(genome_assignments[gi]) != $sid"
            genome_assignments[gi] = next_id
        end
        
        next_id += 1
    end
    
    # Check no genomes still assigned to parent
    # (Those not assigned to a subgroup should have been in list of deleted genomes)
    @assert sid ∉ genome_assignments
end

sid = 803
sid = 929
sid = 899


### Remove taxa with <2 genomes remaining

In [20]:
deleted_taxa = Int[]

for (sid, cnt) in counter(genome_assignments)
    sid == 0 && continue
    if cnt <= 1
        push!(deleted_taxa, sid)
        delete!(taxa_df, taxon_idx(sid))
        genome_assignments[genome_assignments .== sid] .= 0
    end
end

In [21]:
species_df[indexin(deleted_taxa, species_df[!, :db_id]), :]

Unnamed: 0_level_0,index,db_id,taxid,name,genus_id,ngenomes,migration_src_idxs1,migration_single_src,migration_identical,migration_1to1,diameter,outgoing_overlaps,incoming_overlaps
Unnamed: 0_level_1,Int64,Int64,Int64,String,Int64,Int64,String,Bool,Bool,Bool,Float64,Int64,Int64
1,8,1451,53427,Gordonia hirsuta,2,2,540,1,1,1,0.0,0,1
2,1151,1475,78259,Scardovia inopinata,378,2,1155,1,1,1,0.0,0,2
3,313,1211,1034943,Legionella massiliensis,111,2,722,1,1,1,0.0,0,1
4,400,1655,308892,Mesobacillus boroniphilus,141,2,126,1,1,1,0.0,0,1
5,410,1580,127891,Alkalihalobacillus wakoensis,144,2,159,1,1,1,0.0,0,1
6,1021,882,1356,Enterococcus sulfureus,341,2,475,1,1,1,0.0,0,1
7,1029,1670,71451,Enterococcus malodoratus,341,2,468,1,1,1,0.0,0,1
8,407,909,1411,Alkalihalobacillus akibai,144,2,119,1,1,1,0.0,0,2
9,720,747,940,Hydrogenobacter thermophilus,238,2,573,1,1,1,0.0,0,1
10,211,547,33056,Candidatus Kinetoplastibacterium crithidii,62,2,585,1,1,1,0.0,0,1


## Save data

In [22]:
CSV.write(intermediate_out / "taxa.csv", taxa_df);

In [23]:
open(intermediate_out / "deleted-taxa-db-ids.json", "w") do f
    JSON.print(f, deleted_taxa)
end

In [24]:
open(intermediate_out / "genome-taxon-assignments.json", "w") do f
    JSON.print(f, genome_assignments)
end