# 220501 DB overlaps summary draft

In [2]:
import os

In [3]:
import numpy as np
import pandas as pd
import h5py as h5
from tqdm import tqdm

In [4]:
from gambit.db import ReferenceDatabase, Taxon

## Code

In [5]:
def subtree_genomes(taxon):
    """Collect the genomes of every taxon yielded by ``taxon.traverse()``.

    Genomes are returned as one flat list, in traversal order.
    """
    collected = []
    for node in taxon.traverse():
        collected.extend(node.genomes)
    return collected

In [6]:
def ancestor_of_rank(taxon, rank: str):
    """Return the first item of ``taxon.ancestors(True)`` whose ``rank`` equals ``rank``.

    Returns ``None`` when no item matches.
    NOTE(review): the ``True`` argument presumably makes the walk start at ``taxon``
    itself - confirm against the gambit ``Taxon.ancestors`` API.
    """
    matching = (candidate for candidate in taxon.ancestors(True) if candidate.rank == rank)
    return next(matching, None)

def lineage(taxon, ranks):
    """Look up the ancestor of ``taxon`` at each rank in ``ranks``.

    Returns a list with one entry per rank, in the same order as ``ranks``;
    an entry is ``None`` where ``ancestor_of_rank`` finds no match.
    """
    found = []
    for rank in ranks:
        found.append(ancestor_of_rank(taxon, rank))
    return found


In [7]:
def indices_to_slice(indices):
    """Convert an integer index array of the form (n ... m) to ``slice(n, m + 1)``.

    Parameters
    ----------
    indices
        One-dimensional sequence or array of integer indices.

    Returns
    -------
    slice or the input object
        ``slice(n, m + 1)`` when ``indices`` is exactly the consecutive,
        non-negative run ``n, n + 1, ..., m``; otherwise ``indices`` is
        returned unchanged (including when it is empty).
    """
    # Nothing to inspect: indices[0] would raise on an empty input.
    if len(indices) == 0:
        return indices

    first, last = int(indices[0]), int(indices[-1])

    # Negative indices count from the end, so a run such as (-3, -2, -1) is NOT
    # equivalent to slice(-3, 0), which selects nothing. Leave those alone.
    if first < 0:
        return indices

    # Cheap length check first; avoids building a huge range for sparse indices.
    if len(indices) != last - first + 1:
        return indices

    if np.array_equal(indices, np.arange(first, last + 1)):
        return slice(first, last + 1)
    return indices

In [8]:

def fix_nullable_int_col(values):
    """Restore an int-or-None column that Pandas has coerced to a float dtype.

    Each null entry (per ``pd.isnull``) becomes ``None`` and every other entry
    is passed through ``int``. The result is a NumPy array of dtype ``object``.
    """
    restored = []
    for value in values:
        restored.append(None if pd.isnull(value) else int(value))
    return np.asarray(restored, dtype=object)

## Load database

In [9]:
# Load the reference database from a directory. The location can be overridden with the
# GAMBIT_DB_PATH environment variable; the default is the original machine-specific path.
refdb = ReferenceDatabase.load_from_dir(
    os.environ.get('GAMBIT_DB_PATH', '/home/jared/projects/gambit/data/databases/refseq-curated/1.0-beta2/')
)

In [10]:
# Genome set of the loaded database; later cells query its taxa through gset.taxa.
gset = refdb.genomeset

In [11]:
# Session object of the loaded database.
# NOTE(review): the cells visible in this notebook call refdb.session directly, so this name looks unused.
session = refdb.session

## Inspect

In [12]:
# Number of entries in the database's signatures collection.
len(refdb.signatures)

48224

In [13]:
# For each distinct Taxon.rank value returned by the query (None included), count the taxa in the genome set with that rank.
{value: gset.taxa.filter_by(rank=value).count() for (value,) in refdb.session.query(Taxon.rank).distinct()}

{None: 50, 'genus': 452, 'species': 1415}

## Intermediate data location

In [14]:
# Directory holding the precomputed taxa-min-dists.csv / taxa-max-dists.csv files.
# Can be overridden with the GAMBIT_DB_PW_DISTS_DIR environment variable; the default
# is the original machine-specific path.
intermediate_dir = os.environ.get(
    'GAMBIT_DB_PW_DISTS_DIR',
    '/home/jared/code/gambit/gambit-publication/intermediate-data/db-pw-dists',
)

## Load leaf taxon distance matrices

In [15]:
# Read the minimum- and maximum-distance tables; the first CSV column becomes the row index.
# NOTE(review): presumably square matrices over leaf taxon IDs - the following cell asserts
# that row and column labels agree.
min_leaf_dists = pd.read_csv(f'{intermediate_dir}/taxa-min-dists.csv', index_col=0)
max_leaf_dists = pd.read_csv(f'{intermediate_dir}/taxa-max-dists.csv', index_col=0)

In [16]:
# Leaf taxon IDs, taken from the row index of the minimum-distance table.
leaf_tids = min_leaf_dists.index

# Both tables must have the same row order, and their column labels must equal the row IDs
# once those are converted to strings (leaf_tids.map(str)).
assert np.array_equal(max_leaf_dists.index, leaf_tids)
assert np.array_equal(min_leaf_dists.columns, leaf_tids.map(str))
assert np.array_equal(max_leaf_dists.columns, leaf_tids.map(str))

# Replace the column labels with leaf_tids itself so rows and columns share identical labels.
min_leaf_dists.columns = max_leaf_dists.columns = leaf_tids

## Per-taxon diameter and nearest outside taxon

In [17]:
# Map taxon ID -> taxon object for every taxon in the genome set, plus the IDs in sorted order.
taxa_by_id = {taxon.id: taxon for taxon in gset.taxa}
all_tids = sorted(taxa_by_id)

In [18]:
# Build one summary record per taxon: its diameter (largest max-distance between leaves of
# its own subtree) and the closest leaf taxon outside the subtree.
rows = []

for tid in all_tids:
    taxon = taxa_by_id[tid]

    # IDs of the leaves under this taxon, and a boolean mask locating them within leaf_tids.
    subtree_leaves = [l.id for l in taxon.leaves()]
    in_subtree = np.isin(leaf_tids, subtree_leaves)  # np.in1d is deprecated in favour of np.isin

    # Largest entry of the max-distance table restricted to this subtree's leaves.
    diam = max_leaf_dists.loc[subtree_leaves, subtree_leaves].values.max()

    if in_subtree.all():
        # Every leaf lies inside the subtree, so there is no outside leaf to compare against.
        # (argmin on a fully masked array would silently report an inside leaf.)
        min_inter_dist = np.nan
        min_inter_tid = None
        min_inter_name = None
    else:
        # Rows: leaves inside the subtree. Columns: all leaves, with the inside ones masked
        # so that only distances to leaves outside the subtree are considered.
        inter_dists = min_leaf_dists.values[in_subtree, :]
        inter_dists = np.ma.masked_array(inter_dists, np.broadcast_to(in_subtree[None, :], inter_dists.shape))

        # Position of the smallest unmasked distance; its column identifies the outside leaf.
        am_row, am_col = np.unravel_index(inter_dists.argmin(), inter_dists.shape)
        min_inter_dist = inter_dists[am_row, am_col]
        min_inter_tid = leaf_tids[am_col]
        min_inter_name = taxa_by_id[min_inter_tid].name

    rows.append((
        tid,
        taxon.name,
        'none' if taxon.rank is None else taxon.rank,
        taxon.ncbi_id,
        # A threshold of exactly 0 is recorded as missing.
        np.nan if taxon.distance_threshold == 0 else taxon.distance_threshold,
        diam,
        min_inter_dist,
        min_inter_tid,
        min_inter_name,
    ))

# Assemble the table without in-place mutation so re-running the cell is self-contained.
df = pd.DataFrame(
    rows,
    columns=['id', 'name', 'rank', 'ncbi_id', 'threshold', 'diameter', 'min_inter_dist', 'min_inter_taxon_id', 'min_inter_taxon_name'],
).set_index('id')

# ncbi_id mixes ints and missing values; restore ints/None after Pandas' float coercion.
df['ncbi_id'] = fix_nullable_int_col(df['ncbi_id'])

In [19]:
# Sanity check: no taxon has an outside leaf closer than its own threshold.
# Rows with a NaN threshold compare False and therefore pass.
assert not np.any(df['min_inter_dist'] < df['threshold'])

In [20]:
# Flag taxa whose closest outside leaf is nearer than the taxon's own diameter.
df['has_overlap'] = df['min_inter_dist'] < df['diameter']

In [21]:
# Number of taxa flagged as overlapping.
df['has_overlap'].sum()

171

In [22]:
# True where diameter and threshold agree within atol=1e-4 (plus np.isclose's default rtol).
# A NaN threshold yields False because equal_nan is left at its default.
diameter_is_threshold = np.isclose(df['diameter'], df['threshold'], atol=1e-4)

In [23]:
# Number of taxa whose diameter does not match their threshold (taxa with a NaN threshold are counted here too).
(~diameter_is_threshold).sum()

256

In [26]:
# Display only the taxa flagged as overlapping.
df[df['has_overlap']]

Unnamed: 0_level_0,name,rank,ncbi_id,threshold,diameter,min_inter_dist,min_inter_taxon_id,min_inter_taxon_name,has_overlap
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Mobiluncus,genus,2050,0.938501,0.991801,0.987895,1241,Actinobaculum massiliense,True
2,Gordonia,genus,2053,0.925864,0.978936,0.974594,1754,Nocardia farcinica,True
4,Kitasatospora,genus,2063,0.818638,0.947083,0.861725,1820,Streptomyces pyridomyceticus,True
5,Shewanella,genus,22,0.943568,0.995506,0.993230,877,Salmonella enterica,True
10,Mycoplasma,genus,2093,0.934762,1.000000,0.983960,1289,Mesoplasma florum,True
...,...,...,...,...,...,...,...,...,...
1901,Streptococcus pseudopneumoniae subgroup 2,none,,0.500000,0.666717,0.580635,1424,Streptococcus mitis,True
1905,Pseudomonas syringae subgroup 4,none,,0.250000,0.388258,0.295101,1899,Pseudomonas amygdali subgroup 3,True
1909,Shigella boydii,species,621,,0.296933,0.204837,1916,Shigella dysenteriae subgroup 2,True
1910,Shigella dysenteriae,species,622,,0.560220,0.204837,1913,Shigella boydii subgroup 1,True


In [28]:
# Write the summary table to CSV. The destination can be overridden with the
# GAMBIT_DB_OVERLAPS_CSV environment variable; the default is the original machine-specific path.
df.to_csv(os.environ.get('GAMBIT_DB_OVERLAPS_CSV', '/home/jared/projects/gambit/tmp/220501-db-overlaps-summary.csv'))

In [27]:
# Number of taxa with a missing (NaN) threshold.
pd.isnull(df['threshold']).sum()

20

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Distribution of taxon diameters, one boxen plot per rank, drawn on an explicit Axes
# so the figure can be labelled and reused.
fig, ax = plt.subplots(figsize=(16, 8))
sns.boxenplot(data=df, x='diameter', y='rank', ax=ax)
# Logit scale spreads out values close to 0 and 1.
# NOTE(review): diameters of exactly 0 or 1 have no finite logit - confirm how they should be shown.
ax.set_xscale('logit')
ax.set_title('Taxon diameter by rank')
ax.set_xlabel('Diameter')
ax.set_ylabel('Rank')
plt.show()