# 220507 DB pairwise distances

In [1]:
from pathlib import Path

In [2]:
import numpy as np
import pandas as pd
import h5py as h5
from tqdm import tqdm

In [3]:
from gambit.db import load_genomeset, Taxon

## Setup

In [4]:
# Date prefix shared by the notebook name and the output file names.
DATESTR = '220507'
# Notebook identifier; also names the processed-output directory.
NBNAME = f'{DATESTR}-db-pw-dists'

In [5]:
# Input files read by this notebook, keyed by a short name.
infiles = {
    # Currently unused:
    # 'wf_intermediate_dir': Path('/home/jared/code/gambit/gambit-publication/intermediate-data/db-pw-dists'),
    'leaf_min_dists': Path('data-src/220430-leaf-min-dists.csv'),
    'leaf_max_dists': Path('data-src/220430-leaf-max-dists.csv'),
    # NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
    'db_genomes': Path('/home/jared/projects/gambit/data/databases/refseq-curated/1.0-beta2/gambit-genomes-1.0b2-211111.db'),
}

In [6]:
# Directory for this notebook's processed outputs. parents=True also creates
# 'data-processed' itself when it is missing (e.g. on a fresh checkout), where
# mkdir() without it would raise FileNotFoundError.
processed_out = Path('data-processed') / NBNAME
processed_out.mkdir(parents=True, exist_ok=True)

# Output files written at the end of the notebook, keyed by a short name.
outfiles = dict(
    main_table=processed_out / f'{DATESTR}-db-dists-summary.csv',
    overlaps_table=processed_out / f'{DATESTR}-db-overlaps.csv',
)

## Code

In [7]:
def highest_out_ancestor(in_taxon, out_taxon):
    """Return the entry of ``out_taxon``'s lineage just past its shared ancestry with ``in_taxon``.

    The number of common ancestors of the two taxa is used as an index into
    ``out_taxon.lineage()``. Presumably both lists run from the root downwards,
    which would make the result the highest ancestor of ``out_taxon`` (possibly
    ``out_taxon`` itself) that is not also an ancestor of ``in_taxon`` --
    TODO confirm against the gambit ``Taxon`` API.
    """
    n_shared = len(Taxon.common_ancestors([in_taxon, out_taxon]))
    out_lineage = out_taxon.lineage()
    return out_lineage[n_shared]

In [8]:
def indices_to_slice(indices):
    """Convert an integer array of consecutive values (n ... m) to ``slice(n, m + 1)``.

    Parameters
    ----------
    indices
        One-dimensional sequence or array of integer indices.

    Returns
    -------
    slice or the original ``indices``
        ``slice(n, m + 1)`` if ``indices`` is exactly ``n, n + 1, ..., m``;
        otherwise ``indices`` is returned unchanged (including when it is empty).
    """
    # Nothing to convert, and reading the first/last element below would raise IndexError.
    if len(indices) == 0:
        return indices

    start, stop = indices[0], indices[-1] + 1
    if np.array_equal(indices, range(start, stop)):
        return slice(start, stop)
    return indices

In [9]:
def fix_nullable_int_col(values):
    """Restore an int-or-None column that Pandas has coerced to a float data type.

    Null entries (as judged by ``pd.isnull``) become ``None`` and every other
    entry is passed through ``int()``. The result is a NumPy array of
    ``object`` dtype.
    """
    fixed = []
    for value in values:
        fixed.append(int(value) if not pd.isnull(value) else None)
    return np.asarray(fixed, dtype=object)

## Load database

In [10]:
# Load the reference database from the configured file; `gset.taxa` supplies the taxa summarized below.
session, gset = load_genomeset(infiles['db_genomes'])

## Load distance data

In [11]:
# Leaf-vs-leaf distance tables (minimum and maximum, going by the file names);
# the first CSV column supplies the row labels.
min_leaf_dists, max_leaf_dists = (
    pd.read_csv(infiles[key], index_col=0)
    for key in ('leaf_min_dists', 'leaf_max_dists')
)

In [12]:
# Both tables must carry the same leaf taxon IDs on their rows and columns.
leaves_index = min_leaf_dists.index
leaf_id_strs = leaves_index.map(str)  # the column labels are compared in string form

assert np.array_equal(max_leaf_dists.index, leaves_index)
assert all(
    np.array_equal(dists.columns, leaf_id_strs)
    for dists in (min_leaf_dists, max_leaf_dists)
)

# Swap the string column labels for the row index itself so rows and columns share labels.
min_leaf_dists.columns = leaves_index
max_leaf_dists.columns = leaves_index

leaf_tids = leaves_index.values

## Make tables

In [13]:
# Look up taxa by ID, and fix an ascending-ID processing order.
taxa_by_id = {t.id: t for t in gset.taxa}
all_tids = sorted(taxa_by_id.keys())

In [14]:
# Pass over every taxon, collecting one summary row per taxon (main_rows) and one
# row per outside taxon lying within the taxon's diameter (overlap_rows).

# Factor applied to the minimum inter-taxon distance when recognising
# 'min_inter' thresholds below.
MIN_INTER_THRESHOLD_FACTOR = .95

main_rows = []
overlap_rows = []

for tid in tqdm(all_tids):
    taxon = taxa_by_id[tid]
    # A stored threshold of 0 is treated as "no threshold".
    threshold = np.nan if taxon.distance_threshold == 0 else taxon.distance_threshold

    subtree_leaves = [l.id for l in taxon.leaves()]
    # np.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
    in_subtree = np.isin(leaf_tids, subtree_leaves)

    # Diameter: largest entry of the max-distance table among the subtree's leaves.
    diam = max_leaf_dists.loc[subtree_leaves, subtree_leaves].values.max()

    # Rows are the subtree's leaves; columns are all leaves, with the subtree's own
    # columns masked so that only distances to outside leaves take part.
    inter_dists = min_leaf_dists.values[in_subtree, :]
    inter_dists = np.ma.masked_array(inter_dists, np.broadcast_to(in_subtree[None, :], inter_dists.shape))
    # Smallest distance from the subtree to each leaf; computed once and reused below.
    col_min_dists = inter_dists.min(axis=0)

    # Min inter
    am_row, am_col = np.unravel_index(inter_dists.argmin(), inter_dists.shape)
    min_inter_dist = inter_dists[am_row, am_col]
    min_inter_tid = leaf_tids[am_col]

    # Infer threshold method
    if pd.isnull(threshold):
        threshold_method = None
    elif np.isclose(threshold, diam):
        threshold_method = 'diameter'
    elif np.isclose(threshold, MIN_INTER_THRESHOLD_FACTOR * min_inter_dist):
        threshold_method = 'min_inter'
    else:
        threshold_method = 'custom'

    # Main row
    main_rows.append((
        tid,
        taxon.name,
        'none' if taxon.rank is None else taxon.rank,
        taxon.isroot(),
        taxon.isleaf(),
        taxon.ncbi_id,
        threshold,
        diam,
        threshold_method,
        min_inter_dist,
        min_inter_tid,
        taxa_by_id[min_inter_tid].name,
        min_inter_dist <= diam,
    ))

    # Find all overlaps: outside leaves no farther from the subtree than its diameter,
    # grouped by highest_out_ancestor(), keeping the smallest distance per group.
    overlap_cols = np.flatnonzero(col_min_dists <= diam)
    overlaps = dict()

    for col in overlap_cols:
        out_leaf = taxa_by_id[leaf_tids[col]]
        out_taxon = highest_out_ancestor(taxon, out_leaf)
        overlaps[out_taxon] = min(overlaps.get(out_taxon, np.inf), col_min_dists[col])

    # Overlap rows
    for overlap_taxon, dist in overlaps.items():
        overlap_rows.append((
            tid,
            taxon.name,
            diam,
            threshold,
            threshold_method,
            overlap_taxon.id,
            overlap_taxon.name,
            dist,
        ))

100%|██████████| 1917/1917 [00:02<00:00, 654.17it/s] 


In [15]:
# Per-taxon summary table, indexed by taxon ID. The ncbi_id column is rebuilt as
# Python ints / None via fix_nullable_int_col().
main_df = (
    pd.DataFrame(
        main_rows,
        columns=[
            'id',
            'name',
            'rank',
            'isroot',
            'isleaf',
            'ncbi_id',
            'threshold',
            'diameter',
            'inferred_threshold_method',
            'min_inter_dist',
            'min_inter_taxon_id',
            'min_inter_taxon_name',
            'min_inter_overlaps_diameter',
        ],
    )
    .set_index('id')
    .assign(ncbi_id=lambda df: fix_nullable_int_col(df['ncbi_id']))
)

In [16]:
# One row per ("in" taxon, overlapping "out" taxon) pair, indexed by both IDs.
overlaps_df = pd.DataFrame(
    overlap_rows,
    columns=[
        'in_id',
        'in_name',
        'in_diameter',
        'in_threshold',
        'in_threshold_method',
        'out_id',
        'out_name',
        'min_inter',
    ],
).set_index(['in_id', 'out_id'])

In [17]:
# Sanity checks: no taxon's minimum inter-taxon distance is at or below its
# threshold, and the taxa flagged as overlapping are exactly the "in" taxa of
# the overlaps table.
within_threshold = main_df['min_inter_dist'] <= main_df['threshold']
assert not within_threshold.any()

flagged_ids = set(main_df.index[main_df['min_inter_overlaps_diameter']])
assert flagged_ids == set(overlaps_df.index.levels[0])

## Inspect

In [18]:
# Number of rows (taxa) in the summary table.
main_df.shape[0]

1917

In [19]:
# Taxa per inferred threshold method; dropna=False keeps the taxa whose method is None.
main_df.groupby('inferred_threshold_method', dropna=False).size()

inferred_threshold_method
custom         31
diameter     1661
min_inter     205
NaN            20
dtype: int64

In [20]:
# Same breakdown, split further by rank.
main_df.groupby(['rank', 'inferred_threshold_method'], dropna=False).size()

rank     inferred_threshold_method
genus    diameter                      272
         min_inter                     180
none     custom                          7
         diameter                       37
         min_inter                       5
         NaN                             1
species  custom                         24
         diameter                     1352
         min_inter                      20
         NaN                            19
dtype: int64

In [21]:
# Taxa whose threshold matched neither the diameter nor the scaled minimum inter-taxon distance.
main_df[main_df['inferred_threshold_method'] == 'custom']

Unnamed: 0_level_0,name,rank,isroot,isleaf,ncbi_id,threshold,diameter,inferred_threshold_method,min_inter_dist,min_inter_taxon_id,min_inter_taxon_name,min_inter_overlaps_diameter
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
525,Borrelia crocidurae,species,False,True,29520.0,0.4,0.489183,custom,0.473894,527,Borrelia duttonii,True
565,Campylobacter coli,species,False,True,195.0,0.7,0.857719,custom,0.806887,567,Campylobacter jejuni,True
605,Brucella ceti,species,False,True,120577.0,0.02,0.061127,custom,0.031776,604,Brucella pinnipedialis,True
629,Pseudomonas fluorescens,species,False,True,294.0,0.8,0.976624,custom,0.896555,648,Pseudomonas tolaasii,True
637,Pseudomonas parafulva,species,False,True,157782.0,0.6,0.941948,custom,0.718064,657,Pseudomonas fulva,True
639,Pseudomonas alcaligenes,species,False,True,43263.0,0.8,0.961824,custom,0.951335,628,Pseudomonas aeruginosa,True
823,Bordetella bronchiseptica,species,False,True,518.0,0.35,0.415677,custom,0.39393,824,Bordetella pertussis,True
899,Yersinia pseudotuberculosis,species,False,True,633.0,0.17,0.288188,custom,0.187679,897,Yersinia pestis,True
935,Vibrio crassostreae,species,False,True,246167.0,0.6,0.835725,custom,0.816172,959,Vibrio splendidus,True
964,Vibrio tasmaniensis,species,False,True,212663.0,0.7,0.872541,custom,0.833024,959,Vibrio splendidus,True


In [22]:
# Per rank: taxa whose minimum inter-taxon distance is within their diameter, out of the rank's total.
for rank, rank_df in main_df.groupby('rank', dropna=False):
    n_overlapping = rank_df['min_inter_overlaps_diameter'].sum()
    n_taxa = rank_df.shape[0]
    print(f'rank={rank}: {n_overlapping}/{n_taxa}')

rank=genus: 117/452
rank=none: 10/50
rank=species: 44/1415


In [23]:
# Overlap rows per threshold method of the "in" taxon; dropna=False keeps rows whose method is None.
overlaps_df.groupby('in_threshold_method', dropna=False).size()

in_threshold_method
custom         170
min_inter    10589
NaN            503
dtype: int64

In [24]:
# Overlaps whose "in" taxon has a custom threshold.
overlaps_df[overlaps_df['in_threshold_method'] == 'custom']

Unnamed: 0_level_0,Unnamed: 1_level_0,in_name,in_diameter,in_threshold,in_threshold_method,out_name,min_inter
in_id,out_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
525,527,Borrelia crocidurae,0.489183,0.40,custom,Borrelia duttonii,0.473894
565,567,Campylobacter coli,0.857719,0.70,custom,Campylobacter jejuni,0.806887
605,599,Brucella ceti,0.061127,0.02,custom,Brucella canis,0.056766
605,602,Brucella ceti,0.061127,0.02,custom,Brucella neotomae,0.058924
605,604,Brucella ceti,0.061127,0.02,custom,Brucella pinnipedialis,0.031776
...,...,...,...,...,...,...,...
1898,647,Pseudomonas amygdali subgroup 2,0.330937,0.25,custom,Pseudomonas savastanoi,0.297822
1898,634,Pseudomonas amygdali subgroup 2,0.330937,0.25,custom,Pseudomonas syringae,0.302323
1901,1424,Streptococcus pseudopneumoniae subgroup 2,0.666717,0.50,custom,Streptococcus mitis,0.580635
1905,647,Pseudomonas syringae subgroup 4,0.388258,0.25,custom,Pseudomonas savastanoi,0.306273


## Write output

In [25]:
# Write the per-taxon summary table to CSV.
main_df.to_csv(outfiles['main_table'])

In [26]:
# Write the overlaps table to CSV.
overlaps_df.to_csv(outfiles['overlaps_table'])