# 211016 additional genome stats

In [1]:
import json
import os
import sys
from pathlib import Path

In [2]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from Bio import Entrez

In [3]:
from gambit.io.seq import SequenceFile
import entrez_tools as ez
from entrez_tools.db import assembly

In [4]:
# Put ./src (relative to the current working directory) first on the module
# search path so that the local esummary_store module can be imported.
sys.path.insert(0, './src')
from esummary_store import BasicEsummaryStore

## Setup

In [5]:
# Date prefix shared by this notebook's name and its output file names.
DATESTR = '211016'
NBNAME = f'{DATESTR}-additional-genome-stats'

In [6]:
tmpdir = Path('tmp')

# Input locations, keyed by short name.
infiles = {
    'genomes': Path('data-processed/211015-combine-data/211015-gambit-ani-genomes.csv'),
    'assembly_summaries': Path('/home/jared/projects/gambit/data/ncbi/assembly/esummary/'),
    'assembly_seqs': Path('/home/jared/projects/gambit/data/ncbi/assembly/seqs/'),
    'gsg_seqs': Path('/home/jared/projects/gambit/data/validation/200726-gold-standard-seqs/fasta'),
}

In [7]:
# Output directory for this notebook; only the leaf directory is created here.
data_processed = Path('data-processed', NBNAME)
data_processed.mkdir(exist_ok=True)

# Output locations, keyed by short name.
outfiles = {
    'meta': data_processed / (DATESTR + '-ncbi-assembly-meta.csv'),
    'stats': data_processed / (DATESTR + '-assembly-stats.csv'),
}

In [8]:
# Contact email sent with Entrez requests; set ENTREZ_EMAIL to override the default.
Entrez.email = os.environ.get('ENTREZ_EMAIL', 'mjlumpe@gmail.com')
# Read the NCBI API key from the environment instead of committing it to the notebook.
# When NCBI_API_KEY is unset this is None, so no key is configured.
# NOTE(review): the key that was previously hardcoded here should be treated as
# exposed and rotated.
Entrez.api_key = os.environ.get('NCBI_API_KEY')

## Code

In [9]:
def contig_stats(lengths):
    """Calculate total size, contig count, N50 and L50 from contig lengths.

    Contigs are accumulated from longest to shortest until the running total
    reaches half of the total assembly size.

    Parameters
    ----------
    lengths
        Iterable of contig lengths (expected to be non-negative numbers), in
        any order.

    Returns
    -------
    tuple
        ``(size, ncontigs, n50, l50)``:

        * ``size`` - sum of all lengths.
        * ``ncontigs`` - number of lengths given.
        * ``n50`` - length of the contig at which the running total first
          reaches half of ``size``. If the running total is *exactly* half of
          ``size`` at that contig and a following contig exists, the mean of
          that contig's length and the next one's is returned (a float).
        * ``l50`` - 1-based count of contigs needed to reach half of ``size``.

    Raises
    ------
    ValueError
        If ``lengths`` is empty.
    """
    lengths = sorted(lengths, reverse=True)
    if not lengths:
        # Previously this case fell through to a bare ``assert 0``, which is
        # stripped under ``python -O`` and would then silently return None.
        raise ValueError('cannot calculate contig stats from an empty collection of lengths')

    nnucs = sum(lengths)
    mid = nnucs / 2
    total = 0

    for i, l in enumerate(lengths):
        total += l
        if total < mid:
            continue

        if total == mid and i + 1 < len(lengths):
            # Exact tie at the midpoint: average with the next contig.
            n50 = (l + lengths[i + 1]) / 2
        else:
            # Either the midpoint was passed, or the tie occurred on the last
            # contig (only possible when the total size is zero), in which
            # case there is no next contig to average with.
            n50 = l

        return nnucs, len(lengths), n50, i + 1

    # Not reachable for non-negative lengths: the final running total equals
    # nnucs, which is >= nnucs / 2.
    raise AssertionError('running total never reached half of the total size')

## Load data

In [10]:
# The first two CSV columns form the row index; assembly UIDs are kept as strings.
genomes = pd.read_csv(
    infiles['genomes'],
    index_col=[0, 1],
    dtype={'assembly_uid': str},
)

In [11]:
# Boolean mask over the rows of `genomes`: True where the first index level
# is the gold standard data set.
_first_level = genomes.index.get_level_values(0).to_series(index=genomes.index)
in_gsg = _first_level == '200726_gold_standard'

In [12]:
genomes['file'] = None

# Non-gold-standard genomes are located by assembly accession, gold standard
# genomes by their id; both are gzipped FASTA files.
for _mask, _seq_dir, _name_col in [
    (~in_gsg, infiles['assembly_seqs'], 'assembly_accession'),
    (in_gsg, infiles['gsg_seqs'], 'id'),
]:
    _paths = [_seq_dir / f'{_name}.fasta.gz' for _name in genomes.loc[_mask, _name_col]]
    genomes.loc[_mask, 'file'] = SequenceFile.from_paths(_paths, 'fasta', 'gzip')

# Every genome must have its sequence file present on disk.
assert all(seqfile.path.is_file() for seqfile in genomes['file'])

## Get assembly ESummary data

In [13]:
# NOTE(review): this directory is the same path as infiles['assembly_summaries'],
# repeated here as a string literal.
esummaries = BasicEsummaryStore('assembly', '/home/jared/projects/gambit/data/ncbi/assembly/esummary/')

### Download missing

In [14]:
# UIDs of non-gold-standard genomes that are not yet in the store.
_wanted_uids = set(genomes.loc[~in_gsg, 'assembly_uid'])
_to_download = _wanted_uids.difference(esummaries)

# Fetch one ESummary record per missing UID and add it to the store.
for uid in tqdm(_to_download):
    with Entrez.esummary(db='assembly', id=uid, retmode='json') as handle:
        response = json.load(handle)
    esummaries.add(ez.get_esummary_result_json(response))

0it [00:00, ?it/s]

### Extract assembly metadata

In [15]:
# One formatted metadata record per non-gold-standard genome, in row order.
_ncbi_uids = genomes.loc[~in_gsg, 'assembly_uid']
_meta_records = [
    assembly.format_summary_meta(esummaries[uid]['meta'])
    for uid in _ncbi_uids
]
meta_df = pd.DataFrame.from_records(_meta_records, index=genomes.index[~in_gsg])

In [16]:
# Long-format view of the integer-typed metadata columns, keeping the row index.
# NOTE(review): meta_numeric is not referenced again in the cells shown here.
_int_meta = meta_df.select_dtypes(include=int)
meta_numeric = _int_meta.melt(ignore_index=False)

## Calculate stats from sequence files

In [17]:
# For each genome, the list of its sequence record lengths, in file order.
contig_lengths = []

for seqfile in tqdm(genomes['file']):
    with seqfile.parse() as records:
        lengths_in_file = [len(record.seq) for record in records]
    contig_lengths.append(lengths_in_file)

  0%|          | 0/662 [00:00<?, ?it/s]

In [18]:
# One row of (size, ncontigs, n50, l50) per genome, aligned with `genomes`.
_stat_rows = (contig_stats(lengths) for lengths in contig_lengths)
stats_df = pd.DataFrame.from_records(
    _stat_rows,
    index=genomes.index,
    columns=['size', 'ncontigs', 'n50', 'l50'],
)

### Check against NCBI metadata

In [19]:
# Contig counts computed from the sequence files vs. NCBI's scaffold counts.
_computed_ncontigs = stats_df.loc[~in_gsg, 'ncontigs']
n_matches = _computed_ncontigs == meta_df['scaffold_count']
all(n_matches)

True

In [20]:
# N50 computed from the sequence files vs. NCBI's scaffold N50.
_computed_n50 = stats_df.loc[~in_gsg, 'n50']
n50_matches = _computed_n50 == meta_df['scaffold_n50']
all(n50_matches)

True

In [21]:
# L50 computed from the sequence files vs. NCBI's scaffold L50.
_computed_l50 = stats_df.loc[~in_gsg, 'l50']
l50_matches = _computed_l50 == meta_df['scaffold_l50']
all(l50_matches)

False

There are mismatches; inspect them:

In [22]:
# NCBI-reported scaffold N50/L50 for the rows where the L50 comparison failed.
meta_df.loc[~l50_matches, ['scaffold_n50', 'scaffold_l50']]

Unnamed: 0_level_0,Unnamed: 1_level_0,scaffold_n50,scaffold_l50
data_set,ds_index,Unnamed: 2_level_1,Unnamed: 3_level_1
ondov_2016,414,5549,318


In [23]:
# Stats computed from the sequence files for the same rows: first restrict to
# non-gold-standard genomes, then to those whose L50 comparison failed.
stats_df[~in_gsg][~l50_matches]

Unnamed: 0_level_0,Unnamed: 1_level_0,size,ncontigs,n50,l50
data_set,ds_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ondov_2016,414,9030472,2606,5549,317


The L50 is off by one for this genome (317 computed here vs. 318 reported by NCBI). The cause is unknown, but all other genomes match.

## Write output

In [24]:
# Write the NCBI assembly metadata table (non-gold-standard genomes only) to CSV.
meta_df.to_csv(outfiles['meta'])

In [25]:
# Write the stats computed from the sequence files (all genomes) to CSV.
stats_df.to_csv(outfiles['stats'])