# 220419 Set 3 files list

In [1]:
from pathlib import Path
import re

In [2]:
import pandas as pd

## Setup

In [3]:
# Date stamp shared by the notebook name and the output file name.
DATESTR = '220419'
# Notebook identifier; also used as the name of the processed-output directory.
NBNAME = DATESTR + '-set3-files-list'

In [4]:
# Base directory of the genome data set; every input location hangs off it.
_root = Path('/home/jared/projects/gambit/data/genomes/200728-gold-standard')

# Input locations, keyed by role.
infiles = {
    'root': _root,
    'fasta1': _root.joinpath('original/fasta-1'),
    'fasta2': _root.joinpath('original/fasta-2'),
    'fastq': _root.joinpath('fastq'),
}

In [5]:
# Directory holding this notebook's processed outputs, named after the notebook.
processed_out = Path('data-processed') / NBNAME
# parents=True: also create 'data-processed/' itself, so the notebook runs from
# a fresh checkout where that directory does not exist yet.
processed_out.mkdir(parents=True, exist_ok=True)

# Output locations, keyed by role.
# NOTE(review): the file name says "200726" while the input directory is
# "200728-gold-standard" -- confirm which date is intended.
outfiles = dict(
    table=processed_out / f'{DATESTR}-200726-gold-standard-files.csv',
)

## Code

In [6]:
def fasta_to_id(fname):
    """Extract the genome ID from an assembly FASTA file name.

    The name must have the form
    ``<id>_S<digits>_L001_R1_001 Assembly Contigs.fasta`` or
    ``<id>_S<digits>_L001_R1_001 Assembly ContigsUnpaired.fasta``.

    Parameters
    ----------
    fname : str
        File name only (no directory components).

    Returns
    -------
    str
        The ``<id>`` portion of the name.

    Raises
    ------
    ValueError
        If ``fname`` does not follow the expected naming scheme.
    """
    # The dot before "fasta" is escaped so that only a literal '.' is accepted.
    match = re.fullmatch(
        r'(.*)_S\d+_L001_R1_001 Assembly Contigs(?:Unpaired)?\.fasta',
        fname,
    )
    if match is None:
        # Name the offending file instead of failing on None.group().
        raise ValueError(f'Unrecognized FASTA file name: {fname!r}')
    return match.group(1)

def md5sums(files):
    files_str = files if isinstance(files, str) else ' '.join(files)
    
    lines = !md5sum {files_str}

    sums = dict()

    for line in lines:
        md5, f = line.split(maxsplit=1)
        sums[f] = md5

    return sums

## Genomes table from FASTA files

In [7]:
# One (set label, file name) record per FASTA file found in each input directory.
_rows = []

_fasta_dirs = {'200726': infiles['fasta1'], '200817': infiles['fasta2']}
for set_, dir_ in _fasta_dirs.items():
    _rows.extend((set_, fasta_path.name) for fasta_path in dir_.glob('*.fasta'))

df = pd.DataFrame(_rows, columns=['set', 'fasta_file'])

In [8]:
# Derive each genome's ID from its FASTA file name.
df['id'] = [fasta_to_id(name) for name in df['fasta_file']]

# Index rows by (set, id) and sort the index.
df = df.set_index(['set', 'id'])
df = df.sort_index()

In [9]:
# Sanity check: no (set, id) pair may occur twice, because later cells assign
# checksums to rows by looking up these index labels.
assert df.index.is_unique

### Checksums

In [10]:
# Create the FASTA checksum column with None in every row; the following cells
# fill it in one row at a time.
df['fasta_md5'] = None

In [11]:
# Record the checksum of every FASTA file in the first directory (set '200726').
for path, checksum in md5sums(str(infiles['fasta1'] / '*.fasta')).items():
    # Keep only the file name (text after the last '/') to recover the genome ID.
    genome_id = fasta_to_id(path.rpartition('/')[2])
    df.loc[('200726', genome_id), 'fasta_md5'] = checksum

In [12]:
# Record the checksum of every FASTA file in the second directory (set '200817').
for path, checksum in md5sums(str(infiles['fasta2'] / '*.fasta')).items():
    # Keep only the file name (text after the last '/') to recover the genome ID.
    genome_id = fasta_to_id(path.rpartition('/')[2])
    df.loc[('200817', genome_id), 'fasta_md5'] = checksum

## FASTQ files

In [14]:
# Create the FASTQ checksum column with None in every row, then fill it in.
df['fastq_md5'] = None

for path, checksum in md5sums(str(infiles['fastq'] / '*.fastq.gz')).items():
    # The ID is the file name (text after the last '/') up to its first '.'.
    sample_id = path.rpartition('/')[2].partition('.')[0]
    # slice(None) on the first index level: assign to this ID in every set.
    row_key = (slice(None), sample_id)
    df.loc[row_key, 'fastq_md5'] = checksum

In [15]:
# Show the genomes that did not receive a FASTQ checksum.
df.loc[df['fastq_md5'].isna()]

Unnamed: 0_level_0,Unnamed: 1_level_0,fasta_file,fasta_md5,fastq_md5
set,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
200817,18AC0012154-B,18AC0012154-B_S7_L001_R1_001 Assembly Contigs....,02cf1a38d2f48a4aca3a678021d9f7c5,


## Output

In [16]:
# Write the completed file table to the configured CSV path.
df.to_csv(path_or_buf=outfiles['table'])