In [None]:
from pathlib import Path
import json
from gzip import GzipFile

In [2]:
import numpy as np
import pandas as pd
import h5py
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [3]:
from midas.db.models import ReferenceGenomeSet, AnnotatedGenome, Taxon
from midas.io.signaturefile import SignatureFile

## Setup

In [4]:
# Date-string prefix of this notebook and the notebook name derived from it.
# NBNAME names the intermediate output directory created further down.
DATESTR = '210303'
NBNAME = f'{DATESTR}-format-data'

In [5]:
# Root of the local midas project tree; the first three inputs live beneath it.
midas_root_dir = Path('/home/jared/projects/midas/')

# Locations of the input data, keyed by a short name.
# The last two entries are relative to this notebook's working directory.
infiles = {
    'signatures': midas_root_dir.joinpath('data/2019_20/refseq_curated_1.1beta_200604.midas-signatures.gz'),
    'database': midas_root_dir.joinpath('data/2019_20/refseq_curated_1.2a_201221.db'),
    'v11_genomes': midas_root_dir.joinpath('notebooks/midas-notebooks-2019/build-v1-database/out/3-curated-taxonomy-assignments.csv'),
    'v11_overlaps': Path('../200727-find-overlaps/data-intermediate/200729-refseq-curated-1.1_beta-species-overlaps.h5'),
    'migration': Path('../201031-database-v1.1-software-version-migration/data-processed/201218-final-taxonomy-assignments'),
}

In [6]:
# Directory that receives every file written by the "Save" section.
intermediate_out = Path('data-intermediate') / NBNAME
# parents=True: also create 'data-intermediate/' itself, so the notebook runs
# from a fresh checkout instead of failing with FileNotFoundError.
intermediate_out.mkdir(parents=True, exist_ok=True)

## Func defs

In [7]:
def itertuples(df, columns):
    """Iterate over (index, colvals) pairs for selected columns of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to iterate over.
    columns : list
        Column labels to include, in the order they should appear in each tuple.

    Yields
    ------
    (index_label, tuple)
        The row's index label and a tuple of that row's values for ``columns``.

    Notes
    -----
    The columns are selected *before* iterating and rows are produced with
    ``DataFrame.itertuples``. Unlike ``iterrows`` this does not coerce every
    row to a single dtype taken over the whole frame (which e.g. turns integer
    values into floats when the frame also has a float column), and it avoids
    building a Series per row.
    """
    selected = df[list(columns)]
    # With index=True each row tuple starts with the index label, so a row is
    # still produced for every index entry even when `columns` is empty.
    for row in selected.itertuples(index=True, name=None):
        yield row[0], row[1:]

In [8]:
def to_1_indexed(df, check=True):
    """Change a dataframe's RangeIndex start from 0 to 1 for julia compatibility.

    The frame is modified in place: its index is replaced by
    ``RangeIndex(1, nrows + 1)``. Row order is unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index is replaced.
    check : bool
        If true, first assert that the current index is the default
        ``RangeIndex`` (start 0, step 1), so that renumbering only shifts the
        labels by one. Pass ``False`` to renumber a frame with any other index
        (e.g. after sorting).

    Raises
    ------
    AssertionError
        If ``check`` is true and the index is not a 0-based, step-1 RangeIndex.
    """
    if check:
        index = df.index
        assert isinstance(index, pd.RangeIndex), 'index is not a RangeIndex'
        # A stepped range such as RangeIndex(0, 6, 2) also starts at 0, but
        # renumbering it would change more than the start.
        assert index.start == 0 and index.step == 1, 'index is not a 0-based, step-1 range'
    df.index = pd.RangeIndex(1, df.shape[0] + 1)

## v1.1 Genomes and taxonomy

Overlap data refers to species by index: each index is the species' position in the list of all `(genus, species)` tuples, sorted lexicographically.

In [9]:
# Load the v1.1 genome table without its 'key' column, then renumber rows from 1.
genomes_df = pd.read_csv(infiles['v11_genomes']).drop(columns='key')
to_1_indexed(genomes_df)

In [10]:
# Number of rows in the v1.1 genome table (shown as the cell output).
ngenomes = len(genomes_df)
ngenomes

50752

In [11]:
# Sanity check: the genome table must already be ordered by accession.
assert np.array_equal(genomes_df['accession'], sorted(genomes_df['accession']))

In [12]:
# Map each genome accession to its 1-based row label in genomes_df.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
genome_acc_to_index1 = {acc: i for i, acc in genomes_df['accession'].items()}

In [13]:
# One row per (genus, species) pair found in the genome table, with its genome
# count, ordered by (genus, species) and then numbered from 1.
species_v11_df = (
    genomes_df
    .groupby(['genus', 'species'])
    .size()
    .to_frame('ngenomes')
    .sort_index()
    .reset_index()
)
to_1_indexed(species_v11_df, False)

In [14]:
# Number of distinct v1.1 species (shown as the cell output).
nspecies_v11 = len(species_v11_df)
nspecies_v11

1438

In [15]:
# Reverse lookup: (genus, species) tuple -> 1-based row label in species_v11_df.
species_v11_to_index1 = dict(
    (genus_species, idx1)
    for idx1, genus_species in itertuples(species_v11_df, ['genus', 'species'])
)

In [16]:
# Replace each genome's (genus, species) columns with the 1-based index of that
# species in species_v11_df.
genomes_df['species_v11_idx1'] = [
    species_v11_to_index1[genus_species]
    for _, genus_species in itertuples(genomes_df, ['genus', 'species'])
]
genomes_df.drop(columns=['genus', 'species'], inplace=True)

## Database 1.2

### Open

In [17]:
# SQLite engine for the v1.2 database file, plus a session factory bound to it.
engine = create_engine(f"sqlite:///{infiles['database']}")
Session = sessionmaker(bind=engine)

In [18]:
# Session used for all queries against the v1.2 database in the cells that follow.
session = Session()

In [19]:
# Fetch the reference genome set; .one() requires the query to return exactly one row.
gset = session.query(ReferenceGenomeSet).one()

# Show which genome set and version was loaded.
print(gset.name, gset.key, gset.version)

refseq_curated_1.2a1 midas/assembly/curated 1.2a1


### Taxa

In [20]:
# One record per species-rank taxon in the genome set.
# A missing parent id is stored as 0 in the 'genus_id' column.
_rows = [
    (taxon.id, taxon.ncbi_id, taxon.name, taxon.parent_id or 0)
    for taxon in gset.taxa.filter_by(rank='species')
]

# Order by genus, then by database id, and number the rows from 1.
species_v12_df = (
    pd.DataFrame
    .from_records(_rows, columns=['db_id', 'taxid', 'name', 'genus_id'])
    .sort_values(['genus_id', 'db_id'])
)
to_1_indexed(species_v12_df, False)

In [21]:
# Number of v1.2 species (shown as the cell output).
nspecies_v12 = len(species_v12_df)
nspecies_v12

1438

In [22]:
# Lookups from a species' database id / NCBI taxonomy id to its 1-based row
# label in species_v12_df.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
species_v12_id_to_index1 = {id_: i for i, id_ in species_v12_df['db_id'].items()}
species_v12_taxid_to_index1 = {id_: i for i, id_ in species_v12_df['taxid'].items()}

In [23]:
# One record per genus-rank taxon in the genome set.
_rows = [
    (taxon.id, taxon.ncbi_id, taxon.name)
    for taxon in gset.taxa.filter_by(rank='genus')
]

# Order by database id and number the rows from 1.
genera_v12_df = (
    pd.DataFrame
    .from_records(_rows, columns=['db_id', 'ncbi_id', 'name'])
    .sort_values('db_id')
)
to_1_indexed(genera_v12_df, False)

In [24]:
# Number of species in each genus.
# The per-genus counts are indexed by 'genus_id' (the parent taxon's database
# id), while genera_v12_df is indexed by 1-based row number. Assigning the counts
# directly would align database ids against row numbers, so look them up through
# the 'db_id' column instead. Genera without any species get NaN.
# Group 0 holds species with no parent; errors='ignore' tolerates its absence.
genera_v12_df['nspecies'] = genera_v12_df['db_id'].map(
    species_v12_df.groupby('genus_id').size().drop(index=0, errors='ignore')
)

### Genomes

In [25]:
# v1.2 species index of every genome; 0 marks genomes absent from the v1.2 set.
genomes_df['species_v12_idx1'] = 0

for annotated_genome in gset.genomes:
    # Both lookups raise KeyError for an unknown accession / taxon id.
    genome_idx1 = genome_acc_to_index1[annotated_genome.refseq_acc]
    species_idx1 = species_v12_id_to_index1[annotated_genome.primary_taxon_id]
    genomes_df.loc[genome_idx1, 'species_v12_idx1'] = species_idx1

In [26]:
# Genomes per v1.2 species. The counts are indexed by 'species_v12_idx1', which
# is the row label of species_v12_df, so index alignment is correct here.
# Group 0 collects genomes with no v1.2 species; errors='ignore' avoids a
# KeyError when every genome was assigned and that group does not exist.
species_v12_df['ngenomes'] = (
    genomes_df
    .groupby('species_v12_idx1')
    .size()
    .drop(index=0, errors='ignore')
)

In [27]:
# Genomes per genus, summed over the genus' species.
# The sums are indexed by 'genus_id' (the parent taxon's database id), while
# genera_v12_df is indexed by 1-based row number, so look them up through the
# 'db_id' column rather than relying on index alignment.
# Group 0 holds species with no parent; errors='ignore' tolerates its absence.
genera_v12_df['ngenomes'] = genera_v12_df['db_id'].map(
    species_v12_df.groupby('genus_id')['ngenomes'].sum().drop(index=0, errors='ignore')
)

## Migration

### Count genome transitions

In [28]:
# Genome counts for every observed (v1.1 species, v1.2 species) pair.
# A v1.2 index of 0 means the genome has no v1.2 species.
species_migration_counts = (
    genomes_df
    .groupby(['species_v11_idx1', 'species_v12_idx1'])
    .size()
)

In [29]:
# For each v1.1 species: array of the distinct v1.2 indices (possibly including
# 0) that its genomes ended up in.
species_v11_dst_indices1 = (
    species_migration_counts
    .reset_index()
    .groupby('species_v11_idx1')['species_v12_idx1']
    .unique()
)

# Every v1.1 species must appear, in the same order as species_v11_df.
assert np.array_equal(species_v11_dst_indices1.index, species_v11_df.index)

In [30]:
# For each v1.2 species: array of the distinct v1.1 indices its genomes came from.
# Group 0 (genomes with no v1.2 species) is not a species and is removed;
# errors='ignore' avoids a KeyError when no such genomes exist.
species_v12_src_indices1 = (
    species_migration_counts
    .reset_index()
    .groupby('species_v12_idx1')['species_v11_idx1']
    .unique()
    .drop(index=0, errors='ignore')
)

# Every v1.2 species must appear, in the same order as species_v12_df.
assert np.array_equal(species_v12_src_indices1.index, species_v12_df.index)

### Annotate v1.1 species

In [31]:
# Comma-separated list of the v1.2 species each v1.1 species maps to, excluding
# 0 (dropped genomes). The indices are sorted so the text is in ascending order,
# like 'migration_src_idxs1' on the v1.2 side, rather than in set-iteration order.
species_v11_df['migration_dst_idxs1'] = [
    ', '.join(str(idx) for idx in sorted(set(idxs).difference([0])))
    for idxs in species_v11_dst_indices1
]

In [32]:
# Number of genomes of each v1.1 species that have no v1.2 species.
species_v11_df['migration_ndropped'] = 0

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
# Filtering on the destination index inside the loop (instead of selecting
# `.loc[:, 0]` up front) also works when no genome was dropped at all.
for (src_idx1, dst_idx1), n in species_migration_counts.items():
    if dst_idx1 == 0:
        species_v11_df.loc[src_idx1, 'migration_ndropped'] = n

In [33]:
# migration_single_dst: all retained genomes of the v1.1 species went to one
#   v1.2 species (dropped genomes, destination 0, are disregarded).
# migration_1to1: additionally, that v1.2 species received genomes from this
#   v1.1 species only.
species_v11_df['migration_single_dst'] = False
species_v11_df['migration_1to1'] = False

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for i, idxs in species_v11_dst_indices1.items():
    dst_idxs = set(idxs).difference([0])

    if len(dst_idxs) != 1:
        # Zero or several destinations: both flags keep their False default.
        continue

    (dst_idx,) = dst_idxs
    species_v11_df.loc[i, 'migration_single_dst'] = True
    species_v11_df.loc[i, 'migration_1to1'] = np.array_equal(species_v12_src_indices1.loc[dst_idx], [i])

In [34]:
# A v1.1 species is carried over unchanged when it maps 1-to-1 and lost no genomes.
species_v11_df['migration_identical'] = (
    species_v11_df['migration_1to1']
    & species_v11_df['migration_ndropped'].eq(0)
)

### Annotate v1.2 species

In [35]:
# Annotate each v1.2 species with where its genomes came from. The lists below
# are assigned positionally; species_v12_src_indices1 has the same index as
# species_v12_df (asserted where it is built).
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.

# Comma-separated v1.1 source species indices.
species_v12_df['migration_src_idxs1'] = [', '.join(map(str, idxs)) for idxs in species_v12_src_indices1]

# True when all genomes come from a single v1.1 species.
species_v12_df['migration_single_src'] = [len(idxs) == 1 for idxs in species_v12_src_indices1]

# Single source whose genomes all went to this species (none dropped, none elsewhere).
species_v12_df['migration_identical'] = [
    len(idxs) == 1 and np.array_equal(species_v11_dst_indices1[idxs[0]], [i])
    for i, idxs in species_v12_src_indices1.items()
]

# Single source whose genomes went only to this species or were dropped (index 0).
species_v12_df['migration_1to1'] = [
    len(idxs) == 1 and set(species_v11_dst_indices1[idxs[0]]).issubset([i, 0])
    for i, idxs in species_v12_src_indices1.items()
]

## v1.1 Overlaps

This was created in Julia so it is 1-indexed.

In [36]:
# Read the whole 'overlap_components' dataset into memory. The context manager
# closes the HDF5 file afterwards; previously the handle was left open.
with h5py.File(infiles['v11_overlaps'], 'r') as _f:
    # [:] copies the dataset into an in-memory array, so it stays usable after close.
    overlap_components_vec = _f['overlap_components'][:]

In [37]:
# Largest component label, used as the number of components (shown as the cell output).
ncomps = np.max(overlap_components_vec)
ncomps

41

In [38]:
# Invert the per-species component labels into one member list per component.
# Labels are 1-based (list slot = label - 1); entries that are not positive are
# skipped. Members are stored as 1-based positions in the vector.
overlap_components_1 = [[] for _ in range(ncomps)]

for species_idx1, component in enumerate(overlap_components_vec, start=1):
    if component > 0:
        overlap_components_1[component - 1].append(species_idx1)

## Save

In [39]:
# Write the genome table. The 1-based row index is not written.
genomes_df.to_csv(intermediate_out.joinpath('genomes-v1.1.csv'), index=False)

In [40]:
# Write the annotated v1.1 species table. The 1-based row index is not written.
species_v11_df.to_csv(intermediate_out.joinpath('species-v1.1.csv'), index=False)

In [41]:
# Write the v1.2 species and genus tables. The 1-based row indices are not written.
species_v12_df.to_csv(intermediate_out.joinpath('species-v1.2.csv'), index=False)
genera_v12_df.to_csv(intermediate_out.joinpath('genera-v1.2.csv'), index=False)

In [42]:
# Write the overlap components as a JSON list of lists of 1-based indices.
with (intermediate_out / 'overlap-components-v1.1.json').open('w') as f:
    json.dump(overlap_components_1, f)

In [43]:
# Write the (v1.1 species, v1.2 species) genome counts. The index is kept here,
# so both species index levels are written as columns.
(
    species_migration_counts
    .to_frame('ngenomes')
    .to_csv(intermediate_out / 'migration-genome-reassignment-counts.csv')
)