# 211015 Combine data

In [1]:
from pathlib import Path
import json
import re

In [2]:
import numpy as np
import pandas as pd
import xarray as xr
import h5py as h5
from tqdm import tqdm
from scipy.stats import pearsonr, spearmanr, kendalltau

## Setup

In [3]:
# Date prefix shared by this notebook's name and its output files
DATESTR = '211015'
# Notebook identifier; used below to name the per-notebook output directories
NBNAME = f'{DATESTR}-combine-data'

In [4]:
# Directory prefixes shared by several of the inputs below
_local_intermediate = Path('data-intermediate')
_mash_processed = Path('../../data/processed/210902-mash-Escherichia-genomes')
_mash_intermediate = Path('../../data/intermediate/210902-mash-Escherichia-genomes')
# NOTE(review): absolute, machine-specific location - adjust when running elsewhere
_gsg_dir = Path('/home/jared/projects/gambit/data/validation/200726-gold-standard-seqs')

# Input files. The "fastani" and "gambit_dists" entries map data set name -> HDF5 file.
infiles = {
    'genomes': Path('data-processed/211011-get-genomes-list/211011-gambit-ani-additional-genomes.csv'),
    'genomes_ondov': _mash_processed / '210902-get-genomes/210902-mash-genomes-filtered.csv',
    'genomes_gsg': _gsg_dir / 'files.csv',
    'genomes_gsg_results': _gsg_dir / 'results/211015-1.0b1.csv',
    'fastani': {
        'konstantinidis_2005': _local_intermediate / '211012-fastani/konstantinidis_2005.h5',
        'snitkin_2012': _local_intermediate / '211012-fastani/snitkin_2012.h5',
        'ondov_2016': _mash_intermediate / '210904-fastani/ani-pairwise.h5',
        '200726_gold_standard': _local_intermediate / '211015-fastani-gsg/200726_gold_standard.h5',
    },
    'gambit_params': _mash_intermediate / '210917-gambit/params.csv',
    'gambit_dists': {
        'konstantinidis_2005': _local_intermediate / '211012-gambit/konstantinidis_2005.h5',
        'snitkin_2012': _local_intermediate / '211012-gambit/snitkin_2012.h5',
        'ondov_2016': _mash_intermediate / '210917-gambit/pairwise-dists.h5',
        '200726_gold_standard': _local_intermediate / '211015-gambit-gsg/200726_gold_standard.h5',
    },
}

In [5]:
# Per-notebook output directories, named after the notebook
intermediate_out = Path('data-intermediate') / NBNAME
processed_out = Path('data-processed') / NBNAME

# Create them if missing (the parent directories must already exist)
for _outdir in (intermediate_out, processed_out):
    _outdir.mkdir(exist_ok=True)

# Output files written in the "Save" section
outfiles = {
    'genomes': processed_out / (DATESTR + '-gambit-ani-genomes.csv'),
    'data': intermediate_out / 'data.nc',
}

## Code

In [6]:
def get_h5_datasets(group):
    """Read all children of a group into a dict.

    Parameters
    ----------
    group
        Mapping-like object with an ``items()`` method whose values support
        full slicing (``child[:]``). In this notebook it is an open HDF5
        file or group; every child is assumed to be a sliceable dataset.

    Returns
    -------
    dict
        Child name -> result of ``child[:]``, in iteration order.
    """
    datasets = {}
    for name, child in group.items():
        # Full slice pulls the child's contents out of the container
        datasets[name] = child[:]
    return datasets

## Genomes table

In [7]:
# Base genomes table. The first two CSV columns form the row MultiIndex, and
# assembly UIDs are read as strings rather than parsed as numbers.
genomes = pd.read_csv(
    infiles['genomes'],
    index_col=[0, 1],
    dtype={'assembly_uid': str},
)

### Add ondov-2016

In [8]:
# Load the Ondov 2016 genomes and bring the columns in line with the main table
_ondov = (
    pd.read_csv(infiles['genomes_ondov'])
    .rename(columns=dict(index='ds_index', organism='description'))
    .drop(columns=['mash_index', 'url'])
)

# The assembly accession doubles as the genome ID for this data set
_ondov['id'] = _ondov['assembly_accession']
_ondov['data_set'] = 'ondov_2016'
_ondov = _ondov.set_index(['data_set', 'ds_index'])

# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat() is the supported equivalent.
# NOTE: re-running this cell without re-running the one that loads `genomes`
# adds these rows a second time.
genomes = pd.concat([genomes, _ondov])

### Add 200726 gold standard

In [9]:
# Keep only the genomes of the 200726 set, with their name as the genome ID
_gsg = (
    pd.read_csv(infiles['genomes_gsg'])
    .loc[lambda df: df['set'] == 200726, ['name']]
    .rename(columns={'name': 'id'})
)

# The row labels from the original file become the within-data-set index
_gsg.index.name = 'ds_index'
_gsg = _gsg.reset_index()

# Index by (data_set, ds_index) like the main genomes table
_gsg['data_set'] = '200726_gold_standard'
_gsg = _gsg.set_index(['data_set', 'ds_index'])

Use recent GAMBIT query results on genomes to get estimated species

In [10]:
# Position of each gold standard genome in _gsg, keyed by genome ID
_gsg_name2index = {name: i for i, name in enumerate(_gsg['id'])}
# Two-way mapping between row positions in _gsg_results and row positions in _gsg
_gsg_result2row = dict()
_gsg_row2result = dict()
_gsg_assigned = {}

_gsg_results = pd.read_csv(infiles['genomes_gsg_results'])

# Enumerate by position (not index label) so the mapping agrees with
# positional lookups on _gsg_results.
for i, query_name in enumerate(_gsg_results['query.name']):
    # Query names are file names; strip the extension to get the genome ID
    name = re.sub(r'\.fasta$', '', query_name)

    # Only the dictionary lookup may legitimately miss: results for genomes
    # outside this set are skipped. A missing 'query.name' column is a real
    # error and is no longer swallowed.
    j = _gsg_name2index.get(name)
    if j is None:
        continue

    assert j not in _gsg_row2result, f'multiple results for genome {name!r}'
    _gsg_row2result[j] = i
    assert i not in _gsg_result2row
    _gsg_result2row[i] = j

# Every genome in the set must have exactly one result
assert len(_gsg_result2row) == _gsg.shape[0]
assert len(_gsg_row2result) == _gsg.shape[0]

Use description of closest genome where no prediction was made

In [11]:
# Collect descriptions by row position and assign the column once. The previous
# chained assignment (`_gsg['description'].iloc[i] = ...`) is not guaranteed to
# write through to the frame and is a silent no-op under pandas Copy-on-Write.
_gsg_descriptions = [None] * _gsg.shape[0]

for i, j in _gsg_row2result.items():
    # i: row position in _gsg, j: row position of its result in _gsg_results
    _res = _gsg_results.iloc[j]
    desc = _res['predicted.name']

    if pd.isnull(desc):
        # No prediction: fall back to the first two words following the
        # bracketed prefix of the closest genome's description
        closest = _res['closest.description']
        match = re.fullmatch(r'\[.*\] (\w+ \w+).*', closest)
        if match is None:
            raise ValueError(
                f'Cannot parse species from closest genome description: {closest!r}'
            )
        species = match.group(1)
        desc = f'[closest] {species}'
    else:
        desc = f'[predicted] {desc}'

    _gsg_descriptions[i] = desc

_gsg['description'] = _gsg_descriptions

In [12]:
# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat() is the supported equivalent.
genomes = pd.concat([genomes, _gsg])

### Finish

In [13]:
# Distinct data set names: the first level of the genomes MultiIndex
gsets = genomes.index.levels[0]
# Number of genome sets
ngs = gsets.size

In [14]:
# Number of genomes in each data set
ngenomes = {
    set_name: len(set_rows)
    for set_name, set_rows in genomes.groupby('data_set')
}
ngenomes

{'200726_gold_standard': 80,
 'konstantinidis_2005': 70,
 'ondov_2016': 492,
 'snitkin_2012': 20}

## Load data

### FastANI

In [15]:
# FastANI results per genome set: data set name -> {dataset name: array}
fastani_data = dict()

for gset in gsets:
    # Open read-only explicitly: older h5py versions default to a read/write
    # mode, which could create or modify input files by accident.
    with h5.File(infiles['fastani'][gset], 'r') as f:
        # Ondov did not have "pw" and "matrix" subgroups
        grp = f if gset == 'ondov_2016' else f['pw']
        fastani_data[gset] = get_h5_datasets(grp)

### GAMBIT

In [16]:
# GAMBIT parameter table; the row number identifies a parameter combination
params_df = pd.read_csv(infiles['gambit_params'])
params_df.index.name = 'param'

# Number of parameter combinations
nparams = len(params_df)

In [17]:
# Row position of the parameter combination with prefix ATGAC and k=11.
# The unpacking fails unless exactly one row matches.
_is_gambitdb_param = (params_df['prefix'] == 'ATGAC') & (params_df['k'] == 11)
[gambitdb_param] = np.flatnonzero(_is_gambitdb_param)

In [18]:
# GAMBIT results per genome set: data set name -> {dataset name: array}
gambit_data = dict()

for gset in gsets:
    # Open read-only explicitly: older h5py versions default to a read/write
    # mode, which could create or modify input files by accident.
    with h5.File(infiles['gambit_dists'][gset], 'r') as f:
        gambit_data[gset] = get_h5_datasets(f)

## Combine data sets

In [19]:
# Dataset shared by all genome sets, built from the GAMBIT parameter table
common_ds = xr.Dataset.from_dataframe(params_df)

In [20]:
# Separate Datasets for storing per-genome information.
# One Dataset per genome set, combining its pairwise FastANI results with its
# GAMBIT k-mer counts and pairwise distances.
genome_dss = dict()

for gset in gsets:
    ad = fastani_data[gset]
    gd = gambit_data[gset]
    ng = ngenomes[gset]

    # ANI data: one row per genome pair, indexed by the two genome indices
    adf = pd.DataFrame.from_dict(ad)
    adf.set_index(['genome1', 'genome2'], inplace=True)
    adf.index.names = ['g1', 'g2']
    # Name of the index as a whole; presumably what becomes the "pair"
    # dimension of the Dataset (it is referred to as such below)
    adf.index.name = 'pair'
    ds = xr.Dataset(adf)

    if gset == 'ondov_2016':
        # Missing these variables
        # Filled with all-True. Dataset.sizes is used for the dimension length
        # because mapping-style access to Dataset.dims is deprecated in xarray.
        reported = xr.DataArray(np.ones(ds.sizes['pair'], dtype=bool), dims=['pair'])
        ds['reported_both'] = reported
        ds['reported_q1r2'] = reported
        ds['reported_q2r1'] = reported

    else:
        # ANI is in percent
        # NOTE(review): the in-place division relies on the DataArray from
        # data_vars sharing its data with `ds` - confirm on xarray upgrade
        for name, var in ds.data_vars.items():
            if name.startswith('ani_'):
                var /= 100

    # Genome indices
    g1 = ds['g1']
    g2 = ds['g2']
    # Expect every unordered pair of genomes, including each genome with itself
    assert len(g1) == len(g2) == ng * (ng + 1) // 2

    # GAMBIT data
    ds['kmer_count'] = xr.Variable(['param', 'genome'], gd['kmer_counts'])

    gdists = gd['pw_dists']
    gg1 = gd['genome1']
    gg2 = gd['genome2']

    # Ondov-2016 gambit data formatted differently
    if gset == 'ondov_2016':
        # Saved with 1-based indexing
        gg1 = gg1 - 1
        gg2 = gg2 - 1

        # Doesn't contain pairs on the diagonal - need to reindex
        gdists = xr.DataArray(
            gdists,
            dims=['param', 'pair'],
            coords=dict(pair=pd.MultiIndex.from_arrays([gg1, gg2], names=['g1', 'g2'])),
        )
        # Pairs absent from the GAMBIT data are filled with 1
        ds['dist'] = gdists.reindex(pair=ds.indexes['pair'], fill_value=1)

    else:
        # Pairs must be in the same order as in the ANI data
        assert np.array_equal(g1, gg1)
        assert np.array_equal(g2, gg2)
        ds['dist'] = xr.Variable(['param', 'pair'], gdists)

    # Attributes
    ds.attrs['data_set'] = gset

    genome_dss[gset] = ds

## GAMBIT-ANI correlation

In [21]:
# One entry per genome set; each entry holds one correlation value per parameter
_pcorr, _scorr, _kcorr = [], [], []
_corr_outputs = ((pearsonr, _pcorr), (spearmanr, _scorr), (kendalltau, _kcorr))

for gset in tqdm(gsets):
    ds = genome_dss[gset]

    # Use only pairs of distinct genomes flagged in "reported_both"
    mask = (ds['g1'] != ds['g2']) & ds['reported_both']
    ds['corr_mask'] = mask

    # Distances are correlated against negated ANI
    y = -ds['ani_mean'][mask]

    gb = ds['dist'].groupby('param')
    for _corr_func, _corr_list in _corr_outputs:
        # [0] keeps the correlation statistic and drops the p-value
        _corr_list.append(
            gb.map(lambda x, _f=_corr_func: xr.DataArray(_f(x[mask], y)[0]))
        )

100%|██████████| 4/4 [00:14<00:00,  3.61s/it]


In [22]:
# Stack the per-genome-set results along a new dimension taken from gsets
for _var_name, _per_set in (
    ('ani_pearson', _pcorr),
    ('ani_spearman', _scorr),
    ('ani_kendalltau', _kcorr),
):
    common_ds[_var_name] = xr.concat(_per_set, gsets)

## Save

### Genomes

In [23]:
# Write the combined genomes table (all data sets) to the processed output directory
genomes.to_csv(outfiles['genomes'])

### Main data

In [24]:
# Keyword arguments shared by every to_netcdf() call below
save_kw = {'format': 'NETCDF4'}

In [25]:
# Mode "w" starts the output file afresh; shared data goes in the "common" group
common_ds.to_netcdf(outfiles['data'], mode='w', group='common', **save_kw)

In [26]:
# Append each genome set to the same file, under its own "genome/<name>" group
for gset, ds in genome_dss.items():
    # Reset the "pair" index before writing
    ds = ds.reset_index('pair')
    ds.to_netcdf(outfiles['data'], mode='a', group='genome/' + gset, **save_kw)