# 220518 NSPHL test genomes processing

In [1]:
from pathlib import Path
import re

In [2]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from Bio import SeqIO

## Setup

In [3]:
# Date stamp that prefixes this notebook's name and its output file names
# (presumably YYMMDD - confirm).
DATESTR = '220518'
# Notebook identifier, built from the date stamp plus a fixed description.
NBNAME = DATESTR + '-nsphl-test-genomes-processing'

In [4]:
# Input locations read by this notebook, keyed by a short name.
infiles = {
    # Directory containing the genome FASTA files to process.
    'fasta_dir': Path('/home/jared/projects/gambit/data/genomes/220422-nsphl-test-set/fasta'),
}

In [5]:
# Directory that receives this notebook's processed outputs:
# data-processed/<NBNAME>, relative to the current working directory.
processed_out = Path('data-processed') / NBNAME
# parents=True so the cell also works when `data-processed/` itself does not
# exist yet (otherwise mkdir raises FileNotFoundError); exist_ok=True keeps
# the cell safe to re-run.
processed_out.mkdir(parents=True, exist_ok=True)

In [6]:
# Output files written by this notebook, keyed by a short name.
outfiles = {
    # CSV table of genome ID -> MD5 hash.
    'md5': processed_out / f'{DATESTR}-nsphl-test-genomes-md5.csv',
}

## Get file list

In [7]:
# Map genome ID (file name without the .fasta extension) -> path, for every
# *.fasta file directly inside the input directory.
files = dict(
    (fasta_path.stem, fasta_path)
    for fasta_path in infiles['fasta_dir'].glob('*.fasta')
)

# Displayed as the cell output: number of genomes found.
len(files)

453

## Check sequence data is valid

In [8]:
# Matches any single character other than A, T, G, C or N (either case).
invalid_nuc = re.compile(r'[^ATGCN]', re.IGNORECASE)
# Same pattern compiled for bytes input, so raw sequence data can be searched
# without decoding it to str first.
invalid_nuc_bytes = re.compile(invalid_nuc.pattern.encode('ascii'), re.IGNORECASE)

In [9]:
# Validation result per genome ID (True = all records passed). Kept in its own
# cell so that re-running the validation loop does not discard earlier results.
valid = {}

In [10]:
# Scan every FASTA file and record in `valid` whether all of its records
# contain only A/T/G/C/N (case-insensitive). Genomes already marked valid are
# skipped, so the cell can be re-run without repeating finished work; genomes
# previously marked invalid are checked again.
# NOTE(review): a file that yields no records at all is also marked valid.
for _id, file in tqdm(files.items()):
    if valid.get(_id):
        continue

    for i, record in enumerate(SeqIO.parse(file, 'fasta')):
        # bytes(seq) is the public way to obtain the raw sequence data; the
        # private `_data` attribute is an implementation detail of Seq and is
        # not guaranteed to be a `bytes` object.
        match = invalid_nuc_bytes.search(bytes(record.seq))
        if match:
            print(f'Invalid nucleotide {match.group(0)} in record {i} {record.id} of {_id}')
            valid[_id] = False
            # One bad record is enough; skip the rest of this file.
            break

    else:
        # Inner loop ended without `break`: no invalid character was found.
        valid[_id] = True

  0%|          | 0/453 [00:00<?, ?it/s]

In [11]:
# Stop the notebook here if any genome failed validation, and name the
# offending genome IDs in the error instead of failing with a bare assertion.
invalid_ids = sorted(genome_id for genome_id, ok in valid.items() if not ok)
assert not invalid_ids, f'{len(invalid_ids)} genome(s) contain invalid sequence data: {invalid_ids}'

## MD5 hashes

In [12]:
# Run the external `md5sum` command (IPython `!` shell syntax) on every entry
# in the FASTA directory and keep its captured output in `lines`.
# NOTE(review): this globs `*`, whereas the file list above is built from
# `*.fasta`, so non-FASTA entries in the directory get hashed too - confirm
# this is intended.
# NOTE(review): the directory path is interpolated into the shell command
# unquoted; presumably fine while the path contains no spaces - verify.
lines = !md5sum {infiles['fasta_dir']}/*

In [13]:
# Parse the md5sum output into a mapping of genome ID -> MD5 hex digest.
# Each line is split once on whitespace into the digest and the file path;
# the genome ID is the file name with its last extension removed.
digest_path_pairs = (line.split(maxsplit=1) for line in lines)
hashes = {Path(fpath).stem: digest for digest, fpath in digest_path_pairs}

## Write output

In [14]:
# Empty table indexed by genome ID, in sorted order; the index is named 'id'.
id_index = pd.Series(sorted(files), name='id')
df = pd.DataFrame(index=id_index)

In [15]:
# Look up each genome's hash in index order. Plain dict indexing is used so a
# genome without a computed hash raises KeyError instead of going unnoticed.
md5_column = [hashes[genome_id] for genome_id in df.index]
df['md5'] = md5_column

In [16]:
# Write the id -> md5 table to the output CSV file.
md5_csv_path = outfiles['md5']
df.to_csv(md5_csv_path)