In [1]:
from pathlib import Path
import json

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from gambit.kmers import KmerSpec
from gambit.sigs.calc import calc_file_signatures
from gambit.sigs import SignatureArray
from gambit.io.seq import SequenceFile
from gambit.metric import jaccarddist_matrix

from entrez_tools.db.taxonomy import fetch_taxonomy_tree

In [4]:
# Render matplotlib figures inline in the notebook's cell output.
%matplotlib inline

## Setup

In [5]:
# Locations of the input tables and the directory of genome sequence files.
# NOTE(review): 'genomes_dir' is an absolute, machine-specific path while the
# two tables are relative to the notebook -- confirm whether it can be made
# relative before sharing this notebook.
infiles = {
    'genomes_table': Path('../../data/processed/211129-update-external-data-sets/211201-konstantinidis-2005/211201-konstantinidis-2005-genomes.csv'),
    'taxa_table': Path('../../data/processed/211129-update-external-data-sets/211201-konstantinidis-2005/211201-konstantinidis-2005-taxa.csv'),
    'genomes_dir': Path('/home/jared/projects/gambit/data/external/konstantinidis-2005/genomes/'),
}

In [6]:
# Scratch directory relative to the current working directory; exist_ok=True
# makes re-running this cell harmless when the directory already exists.
tmpdir = Path('tmp')
tmpdir.mkdir(exist_ok=True)

## Load data

In [7]:
# Genome table; the row position in this frame is used later as the genome index.
genomes_df = pd.read_csv(infiles['genomes_table'])
# Number of genomes (rows in the table), displayed as the cell output.
ngenomes = len(genomes_df)
ngenomes

70

In [8]:
# Taxon table; later cells read its 'taxid' and 'parent_taxid' columns to build
# the tree and its 'name' and 'rank' columns to label nodes.
taxa_df = pd.read_csv(infiles['taxa_table'])

## Taxonomy tree

### Assemble tree

In [9]:
# Enable IPython's autoreload extension. Mode 1 reloads only the modules that
# have been registered with %aimport.
%load_ext autoreload
%autoreload 1

In [10]:
# Register gambit_analysis.trees with autoreload so edits to that module are
# reloaded before cells execute.
%aimport gambit_analysis.trees
from gambit_analysis.trees import BasicIDTree as Tree, BasicIDNode as Node

In [11]:
class TaxonNode(Node):
    """Tree node for a taxon, labelled from the fields of its ``data`` mapping."""

    def _repr_node_(self, p):
        """Write a ``(taxid) name [rank]`` label for this node to the printer ``p``.

        Requires ``self.data`` to provide the keys ``taxid``, ``name`` and ``rank``.
        """
        label = '({taxid}) {name} [{rank}]'.format(**self.data)
        p.text(label)

In [12]:
# Build (taxid, parent taxid or None, data dict) tuples, one per row of the
# taxon table, then assemble the tree from them.
_links = []

for _, taxon_row in taxa_df.iterrows():
    record = dict(taxon_row.items())
    parent = record['parent_taxid']
    # A parent_taxid of 0 is treated as "no parent".
    if parent == 0:
        parent = None
    _links.append((record['taxid'], parent, record))

tree = Tree.from_tuples(_links, TaxonNode)

In [13]:
# Number of nodes in the full taxonomy tree.
len(tree.nodes)

201

### Assign genomes

In [14]:
# Give every taxon a fresh, empty list so that re-running this cell does not
# accumulate duplicate indices.
for taxon in tree.nodes.values():
    taxon.data['genome_idxs_direct'] = []

# Record each genome (identified by its row position in genomes_df) on the
# node whose ID equals the genome's taxid.
for gi, taxid in enumerate(genomes_df['taxid']):
    tree.nodes[taxid].data['genome_idxs_direct'].append(gi)

In [15]:
# Visit nodes with the 'post' ordering so that each child's 'genome_idxs' has
# been filled in before its parent reads it.
for node in tree.root.iter_subtree('post'):
    # Own genomes first, followed by those of each child in child order.
    merged = list(node.data['genome_idxs_direct'])
    merged += [gi for sub in node.children for gi in sub.data['genome_idxs']]
    node.data['genome_idxs'] = np.asarray(merged)
    node.data['ngenomes'] = len(merged)

In [16]:
# Sanity check: the root's aggregated indices are exactly 0..ngenomes-1, i.e.
# every genome was assigned to the tree once and only once.
assert np.array_equal(np.sort(tree.root.data['genome_idxs']), np.arange(ngenomes))

### Contract

In [17]:
def _copy_contract(node, parent2):
    while len(node.children) == 1:
        node = node.children[0]
    
    node2 = TaxonNode(node.id, node.data)
    if parent2 is not None:
        parent2._add_child(node2)
        
    for child in node.children:
        _copy_contract(child, node2)
    
    return node2

In [18]:
# Contracted copy of the taxonomy tree, built from the root down.
tree2 = Tree(_copy_contract(tree.root, None))

In [19]:
# Number of nodes remaining after contraction.
len(tree2.nodes)

113

## Genome signatures and pairwise distances

### Signatures

In [20]:
# One gzipped FASTA file per genome, named after its assembly accession and
# listed in genomes_df row order.
genome_paths = [
    infiles['genomes_dir'] / f'{acc}.fa.gz'
    for acc in genomes_df['assembly_acc']
]
seq_files = SequenceFile.from_paths(genome_paths, 'fasta', 'gzip')

In [21]:
# K-mer specification used for the signatures below.
# NOTE(review): presumably k=11 with the prefix 'ATGAC' -- confirm against
# gambit.kmers.KmerSpec.
kspec = KmerSpec(11, 'ATGAC')

In [22]:
# Compute signatures for the genome files using `kspec`; progress=True shows a
# progress bar while the files are processed.
sigs = calc_file_signatures(kspec, seq_files, progress=True)

100%|██████████| 70/70 [00:12<00:00,  5.53it/s]


In [23]:
# Wrap the computed signatures in a SignatureArray. This rebinds `sigs`, so
# re-running only this cell wraps the already-wrapped object again.
sigs = SignatureArray(sigs)

### Distance matrix

In [24]:
# All-vs-all distance matrix of the signatures against themselves.
# NOTE(review): presumably dmat[i, j] is the Jaccard distance between genomes
# i and j in genomes_df row order -- confirm against gambit.metric.
dmat = jaccarddist_matrix(sigs, sigs)

In [25]:
def sub_dmat(inds):
    """Return the square block of the global ``dmat`` whose rows and columns are both ``inds``."""
    grid = np.ix_(inds, inds)
    return dmat[grid]

## Taxon diameters and overlap between sibling taxa

In [26]:
# For every taxon in the contracted tree, compute:
#   'diameter'        - largest pairwise distance among the genomes under it
#   'child_min_dists' - matrix of smallest distances between genomes of each pair of children
#   'overlap'         - boolean matrix, True at [i, j] when children i and j lie closer
#                       together than the diameter of child j
# The 'post' ordering is relied on so that each child's 'diameter' is already
# set when its parent reads it.
for node in tree2.root.iter_subtree('post'):
    node.data['diameter'] = sub_dmat(node.data['genome_idxs']).max()

    nc = len(node.children)
    child_gidxs = [child.data['genome_idxs'] for child in node.children]
    child_diams = [child.data['diameter'] for child in node.children]
    md = node.data['child_min_dists'] = np.zeros((nc, nc))

    # Each step fills both md[i, j] and md[j, i], so visiting only j <= i is
    # enough. This ordering yields the same values the full i-by-j sweep ended
    # up with, at half the number of min() evaluations.
    for i in range(nc):
        for j in range(i + 1):
            d = dmat[np.ix_(child_gidxs[i], child_gidxs[j])].min()
            md[i, j] = md[j, i] = d

    # Broadcasting compares md[i, j] with child_diams[j]; the diagonal (a child
    # against itself) is masked out.
    node.data['overlap'] = (md < child_diams) & ~np.identity(nc, dtype=bool)

In [30]:
# List every taxon that has at least one overlapping pair of children,
# followed by all of its children (tab-indented).
for node in tree2.nodes.values():
    if not node.data['overlap'].any():
        continue

    header = '({taxid}) {name} [{rank}] {ngenomes}'.format(**node.data)
    print(header)
    for child in node.children:
        line = '\t({taxid}) {name} [{rank}] {ngenomes}'.format(**child.data)
        print(line)

(2) Bacteria [superkingdom] 70
	(1224) Proteobacteria [phylum] 42
	(1783272) Terrabacteria group [clade] 28
(1224) Proteobacteria [phylum] 42
	(1236) Gammaproteobacteria [class] 27
	(28216) Betaproteobacteria [class] 9
	(210) Helicobacter pylori [species] 2
	(28211) Alphaproteobacteria [class] 4
(1236) Gammaproteobacteria [class] 27
	(91347) Enterobacterales [order] 15
	(446) Legionella pneumophila [species] 3
	(672) Vibrio vulnificus [species] 2
	(32033) Xanthomonadaceae [family] 4
	(286) Pseudomonas [genus] 3
(543) Enterobacteriaceae [family] 10
	(590) Salmonella [genus] 4
	(562) Escherichia coli [species] 4
	(42897) Shigella flexneri 2a [serotype] 2
(1783272) Terrabacteria group [clade] 28
	(1239) Firmicutes [phylum] 22
	(1760) Actinomycetia [class] 6
(1239) Firmicutes [phylum] 22
	(91061) Bacilli [class] 20
	(1502) Clostridium perfringens [species] 2
(91061) Bacilli [class] 20
	(1385) Bacillales [order] 11
	(1301) Streptococcus [genus] 9
(86661) Bacillus cereus group [species group

In [None]:
# Ad-hoc inspection: `node` is whatever node the most recently executed loop
# left bound, so this output depends on cell execution order.
node.data

In [None]:
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform

In [None]:
# Complete-linkage hierarchical clustering of the genomes; linkage() is given
# the condensed form of the square distance matrix.
dmat_condensed = squareform(dmat)
link = linkage(dmat_condensed, method='complete')

In [None]:
# Clustered heatmap of the distance matrix, using the precomputed linkage for
# both rows and columns so the two axes share one ordering.
sns.clustermap(
    dmat,
    row_linkage=link,
    col_linkage=link,
    cmap='jet',
)