# 211003 additional genome stats

In [1]:
from pathlib import Path
import json

In [2]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
from gambit.kmers import KmerSpec
from gambit.io.seq import SequenceFile

## Setup

In [4]:
# Date prefix shared by this notebook's name and its output file names.
DATESTR = '211003'
# Notebook identifier; used below to name the notebook's working directories.
NBNAME = f'{DATESTR}-additional-genome-stats'

In [5]:
# Root directory for intermediate files.
tmpdir = Path('tmp')

# Input files read by this notebook, keyed by a short label.
infiles = {
    'genomes': tmpdir / 'genome_files.csv',
}

In [6]:
# Scratch directory belonging to this notebook, created if it does not exist yet.
nb_tmpdir = tmpdir / NBNAME
nb_tmpdir.mkdir(exist_ok=True)

# Intermediate outputs: a single JSON file of FASTA statistics and a directory
# that receives one JSON file of FASTQ statistics per genome.
tmpfiles = {
    'fasta_stats': nb_tmpdir / 'fasta_stats.json',
    'fastq_stats': nb_tmpdir / 'fastq-stats/',
}

In [7]:
# Directory for this notebook's final outputs, created if it does not exist yet.
data_processed = Path('data-processed') / NBNAME
data_processed.mkdir(exist_ok=True)

# Final output files written by this notebook, keyed by a short label.
outfiles = {
    'genome_stats': data_processed / f'{DATESTR}-additional-genome-stats.csv',
}

## Load data

In [8]:
# Table of genomes; later cells read its 'name', 'fasta_file' and 'fastq_file' columns.
genomes_df = pd.read_csv(infiles['genomes'])

In [9]:
# Genome names, used as keys for the per-genome statistics computed below.
genome_names = genomes_df['name']
# Sequence file objects built from the path columns; later cells read them via .parse().
fasta_files = SequenceFile.from_paths(genomes_df['fasta_file'], 'fasta')
# The FASTQ inputs are declared as gzip-compressed.
fastq_files = SequenceFile.from_paths(genomes_df['fastq_file'], 'fastq', compression='gzip')

## FASTA

In [10]:
# Collect the contig lengths of every assembly. The result is cached as JSON
# and only recomputed when the cache file does not exist, so re-running the
# notebook skips re-parsing the FASTA files.
if not tmpfiles['fasta_stats'].is_file():
    data = {}

    # zip() has no length, so tqdm needs an explicit total to display
    # progress and a time estimate.
    for name, file in tqdm(zip(genome_names, fasta_files), total=len(genome_names)):
        lens = [len(record.seq) for record in file.parse()]
        data[name] = dict(contig_lengths=lens)

    with tmpfiles['fasta_stats'].open('w') as f:
        json.dump(data, f)

## FASTQ

In [11]:
# Directory that receives one JSON statistics file per genome (see the loop below).
tmpfiles['fastq_stats'].mkdir(exist_ok=True)

In [13]:
# Count the reads and total nucleotides in each FASTQ file. Each genome's
# result is written to its own JSON file, and genomes whose file already
# exists are skipped, so an interrupted run can be resumed.
for name, file in tqdm(zip(genome_names, fastq_files), total=len(genome_names)):
    # The output path gets its own name: rebinding `file` here would replace
    # the sequence file object that is parsed below with a plain Path.
    stats_file = tmpfiles['fastq_stats'] / f'{name}.json'
    if stats_file.is_file():
        continue

    nreads = 0
    total_len = 0

    for record in file.parse():
        nreads += 1
        total_len += len(record.seq)

    stats = dict(nreads=nreads, total_len=total_len)
    with open(stats_file, 'w') as f:
        json.dump(stats, f)

  0%|          | 0/80 [00:00<?, ?it/s]

## Compile statistics into table

In [14]:
def calc_n50(lengths):
    """Return the N50 of a collection of contig lengths.

    The lengths are sorted in ascending order and accumulated; the result is
    the first length at which the running sum exceeds half of the total. If
    the running sum lands exactly on half of the total, the mean of that
    length and the next larger one is returned instead (analogous to the
    median of an even-sized sample).

    Parameters
    ----------
    lengths
        Iterable of non-negative contig lengths.

    Returns
    -------
    The selected length, or the mean of two adjacent lengths (a float) in
    the exact-half case.

    Raises
    ------
    ValueError
        If ``lengths`` is empty, or if no length reaches half of the total
        (only possible with negative values).
    """
    ordered = sorted(lengths)
    if not ordered:
        raise ValueError('cannot compute N50 of an empty collection of lengths')

    half = sum(ordered) / 2
    running = 0

    for i, length in enumerate(ordered):
        running += length
        if running == half:
            # Exactly half of the total is covered: average with the next
            # length. The bounds check only matters when the total is zero,
            # where the last element can hit this branch.
            if i + 1 < len(ordered):
                return (length + ordered[i + 1]) / 2
            return length
        if running > half:
            return length

    # Explicit error instead of `assert`, which is removed under `python -O`.
    raise ValueError('could not determine N50; lengths must be non-negative')

In [15]:
# Start from the cached FASTA statistics, a mapping from genome name to a
# dict of statistics.
with tmpfiles['fasta_stats'].open() as fasta_fh:
    genome_stats = json.load(fasta_fh)

# Merge each genome's FASTQ statistics into its entry.
for genome_name in genome_stats:
    fastq_json = tmpfiles['fastq_stats'] / f'{genome_name}.json'
    with fastq_json.open() as fastq_fh:
        genome_stats[genome_name].update(json.load(fastq_fh))

In [16]:
# One summary record per genome: assembly size, contig count and N50 from the
# contig lengths, plus the read and nucleotide counts from the FASTQ file.
_rows = [
    {
        'name': genome,
        'assembly_size': sum(info['contig_lengths']),
        'assembly_ncontigs': len(info['contig_lengths']),
        'assembly_n50': calc_n50(info['contig_lengths']),
        'fastq_nreads': info['nreads'],
        'fastq_nnucs': info['total_len'],
    }
    for genome, info in genome_stats.items()
]

stats_df = pd.DataFrame.from_records(_rows)

In [17]:
# Estimated sequencing coverage: total read nucleotides divided by assembly size.
stats_df['est_coverage'] = stats_df['fastq_nnucs'] / stats_df['assembly_size']

In [18]:
# Write the compiled table to the processed-data directory, without the index column.
stats_df.to_csv(outfiles['genome_stats'], index=False)