# 220526 new genome files processing

In [1]:
from pathlib import Path
import shutil
import re

In [2]:
import pandas as pd

In [3]:
from gambit.util.io import read_lines

## Setup

In [4]:
# Date stamp used in this notebook's name and in its output file names.
DATESTR = '220526'
# Notebook name, built from the date stamp.
NBNAME = DATESTR + '-new-genome-files-processing'

In [5]:
# Input locations, keyed by role.
infiles = {
    'david_table': 'data-src/220520-david-set4-genomes-table.xlsx',
    'src_dir': Path('/home/jared/projects/gambit/data/genomes/220422-nsphl-test-set/fasta-original'),
    'set3_orig_dirs': [
        Path('/home/jared/projects/gambit/data/genomes/200726-gold-standard/original') / subdir
        for subdir in ['fasta-1', 'fasta-2']
    ],
    'set3_genomes': '/home/jared/code/gambit/gambit-publication/resources/genomes/set3/genomes.txt',
}

In [6]:
# Directory that receives this notebook's processed outputs.
processed_out = Path('data-processed') / NBNAME
# parents=True so the cell also works when 'data-processed' does not exist yet.
processed_out.mkdir(parents=True, exist_ok=True)

In [7]:
# Output locations, keyed by role: two CSV files under processed_out and the
# two directories that renamed FASTA files are copied into.
outfiles = {
    'assignments': processed_out / f'{DATESTR}-new-genome-file-assignments.csv',
    'set4_table': processed_out / f'{DATESTR}-david-set4-table-files.csv',
    'set3_files': Path('/home/jared/projects/gambit/data/genomes/200726-gold-standard/fasta-improved'),
    'set4_files': Path('/home/jared/projects/gambit/data/genomes/220422-nsphl-test-set/fasta'),
}

## Code

In [8]:
def identity(x):
    """Return ``x`` unchanged."""
    return x

In [9]:
def groupby(items, key=identity, value=identity):
    """Group ``items`` into a dict of lists.

    Each item is passed through ``key`` and ``value``; the resulting value is
    appended to the list stored under the resulting key. Keys are inserted in
    first-seen order and each list keeps the order of ``items``.
    """
    groups = {}
    for item in items:
        group_key = key(item)
        group_value = value(item)
        if group_key not in groups:
            groups[group_key] = []
        groups[group_key].append(group_value)
    return groups

In [10]:
def find_duplicates(items, key=identity, value=identity):
    """Return the groups of ``groupby(items, key, value)`` with more than one member.

    The result maps each such key to its list of values.
    """
    duplicates = {}
    for group_key, members in groupby(items, key, value).items():
        if len(members) > 1:
            duplicates[group_key] = members
    return duplicates

In [11]:
import operator as op

# Accessors for the elements at index 0 and index 1 (e.g. of a pair).
get0, get1 = (op.itemgetter(position) for position in (0, 1))

In [12]:
def check_bijection(pairs):
    """Assert that no left element and no right element of ``pairs`` repeats.

    Pairs are processed in order. For each one, an ``AssertionError`` carrying
    the offending element is raised if its left element was already seen as a
    left element, and otherwise if its right element was already seen as a
    right element.

    Returns ``(lefts, rights)``: the sets of all left and all right elements.
    """
    seen_left, seen_right = set(), set()
    for pair in pairs:
        first, second = pair
        assert first not in seen_left, first
        assert second not in seen_right, second
        seen_left.add(first)
        seen_right.add(second)
    return seen_left, seen_right

## Files

### Src files

In [13]:
# FASTA files in the source directory. Each ID is the file stem with an
# optional trailing "_contigs" removed.
_files = [*infiles['src_dir'].glob('*.fasta')]
_ids = [re.fullmatch(r'(.+?)(_contigs)?', path.stem)[1] for path in _files]

src_files = {file_id: path for file_id, path in zip(_ids, _files)}
src_ids = set(_ids)

# No two files may reduce to the same ID.
assert len(_ids) == len(src_ids)

### David's Spreadsheet

In [14]:
# Load the spreadsheet, shorten the ID column's name and store IDs as strings.
david_table = (
    pd.read_excel(infiles['david_table'])
    .rename(columns={'entity:miniseq_id': 'miniseq_id'})
    .assign(miniseq_id=lambda table: table['miniseq_id'].astype(str))
)

In [15]:
# Mask of every row whose miniseq_id occurs more than once (keep=False marks
# all occurrences, not just the repeats).
_dup = david_table['miniseq_id'].duplicated(keep=False)
david_table.loc[_dup]

Unnamed: 0,miniseq_id,gambit_predicted_taxon,Closest Predicted Taxon,Alternative Result,est_coverage,genome_length,r1_mean_q,run_id
44,21-00368644,Campylobacter jejuni,,Campylobacter jejuni,43.8,1660282,35.87,NV-MN01149-210701
80,21-00368644,Enterobacter,,Enterobacter cloacae,424.87,5041048,35.91,NV-A01307-210721


In [16]:
# Unpacking into a single name fails unless exactly one ID is duplicated.
[dup_miniseq_id] = david_table.loc[_dup, 'miniseq_id'].unique()
(dup_miniseq_id, dup_miniseq_id in src_ids)

('21-00368644', True)

#### Reformat

Talked with David to decide on this numbering. The file labeled `21-00368644` in the set I just got is the "A" version.

In [17]:
# Replacement IDs keyed by (miniseq_id, run_id), used to tell apart rows that
# share a miniseq_id.
set4_ids_override = dict([
    (('21-00368644', 'NV-A01307-210721'), '21-00368644A'),
    (('21-00368644', 'NV-MN01149-210701'), '21-00368644B'),
])

In [18]:
# Independent copy of just the two identifying columns.
set4_table = david_table.loc[:, ['miniseq_id', 'run_id']].copy()

In [19]:
# Index each row by its override ID when (miniseq_id, run_id) has one,
# otherwise by its miniseq_id.
set4_table.index = pd.Series(
    [
        set4_ids_override.get((miniseq_id, run_id), miniseq_id)
        for miniseq_id, run_id in zip(set4_table['miniseq_id'], set4_table['run_id'])
    ],
    name='id',
)

In [20]:
# After applying the overrides every row must have a distinct ID.
assert set4_table.index.is_unique
set4_table = set4_table.sort_index()

In [21]:
# All set 4 IDs, for membership tests.
set4_ids = {*set4_table.index}

### Original Set 3 files

In [22]:
# Every FASTA file found directly inside the original set 3 directories.
set3_orig_files = [
    fasta_path
    for orig_dir in infiles['set3_orig_dirs']
    for fasta_path in orig_dir.glob('*.fasta')
]

In [23]:
# Genome ID = the part of the file stem before "_S<digits>_L001_R1_001".
set3_orig = set(
    re.fullmatch(r'(.+?)_S\d+_L001_R1_001.+', path.stem)[1]
    for path in set3_orig_files
)

In [24]:
# Current set 3 IDs: each listed file name up to its first ".".
set3 = set(fname.partition('.')[0] for fname in read_lines(infiles['set3_genomes']))
# The current set must be a proper subset of the original one.
assert set3.issubset(set3_orig) and set3 != set3_orig

In [25]:
# IDs present in the original set 3 files but not in the current list.
set3_removed = set3_orig.difference(set3)

In [26]:
# (number of current set 3 IDs, number of removed IDs)
(len(set3), len(set3_removed))

(88, 8)

## Make assignments

In [27]:
# One row per source file, indexed and sorted by source ID. The destination
# columns start empty and are filled in by assign().
assignments = (
    pd.DataFrame(
        [(src_id, path.name) for src_id, path in src_files.items()],
        columns=['src_id', 'src_file'],
    )
    .set_index('src_id')
    .sort_index()
    .assign(dst_group=None, dst_id=None, notes=None)
)

In [28]:
def isassigned(group=None):
    """Boolean mask over the rows of ``assignments``.

    With ``group=None`` the mask marks rows whose ``dst_group`` is not null;
    otherwise it marks rows whose ``dst_group`` equals ``group``.
    """
    dst_groups = assignments['dst_group']
    if group is None:
        return ~dst_groups.isnull()
    return dst_groups == group


def assigned_ids(group=None):
    """Set of index labels (source IDs) selected by ``isassigned(group)``."""
    mask = isassigned(group)
    return set(assignments.index[mask])


def unassigned_ids():
    """Set of index labels (source IDs) whose ``dst_group`` is still null."""
    mask = ~isassigned()
    return set(assignments.index[mask])


def assigned_to_ids(group=None):
    """Set of ``dst_id`` values on the rows selected by ``isassigned(group)``."""
    mask = isassigned(group)
    return set(assignments.loc[mask, 'dst_id'])

In [29]:
def assign(src_id, dst_group, dst_id, note=None):
    """Record the destination of one source file in ``assignments``.

    Sets ``dst_group``, ``dst_id`` and ``notes`` on the row labelled
    ``src_id``. Raises ``AssertionError`` if that row already has a
    ``dst_group``.
    """
    current_group = assignments.loc[src_id, 'dst_group']
    assert pd.isnull(current_group)
    target_columns = ['dst_group', 'dst_id', 'notes']
    assignments.loc[src_id, target_columns] = (dst_group, dst_id, note)

### Set 3

In [30]:
# Exact matches: source IDs that are also set 3 IDs keep their ID.

for id_ in src_ids.intersection(set3):
    assign(id_, dst_group='set3', dst_id=id_)

len(assigned_ids('set3'))

39

In [31]:
# With characters removed from start: a still-unassigned source ID that is a
# suffix of a set 3 ID. Pairs are (set 3 ID, source ID).

partial_set3_pairs = [
    (candidate, stub)
    for stub in unassigned_ids()
    for candidate in set3
    if candidate.endswith(stub)
]
# Each set 3 ID and each source ID may appear in at most one pair.
check_bijection(partial_set3_pairs)

for set3_id, src_id in partial_set3_pairs:
    assign(src_id, 'set3', set3_id, note='Characters missing from start')

len(partial_set3_pairs)

48

In [32]:
# To removed genomes, possibly with characters removed from start.
# Pairs are (removed set 3 ID, source ID).

set3_rm_pairs = [
    (removed_id, stub)
    for stub in unassigned_ids()
    for removed_id in set3_removed
    if removed_id.endswith(stub)
]
# Each source ID may appear in at most one pair.
assert len(set3_rm_pairs) == len({stub for _, stub in set3_rm_pairs})

for set3_id, src_id in set3_rm_pairs:
    assign(src_id, dst_group='set3 removed', dst_id=set3_id)

len(set3_rm_pairs)

7

### Set 4

In [33]:
# Exact matches: source IDs that are also set 4 IDs keep their ID.

for id_ in src_ids.intersection(set4_ids):
    assign(id_, dst_group='set4', dst_id=id_)

In [34]:
# "21" prefix changed to "20":

for src_id in unassigned_ids():
    if not src_id.startswith('20'):
        continue
    # Candidate set 4 ID: the same ID with its first two characters as "21".
    id2 = f'21{src_id[2:]}'
    if id2 in set4_ids:
        assign(src_id, 'set4', id2, note='2nd character incorrect')

In [35]:
# Duplicated miniseq ID - the one in this set is the "A" version

assign(dup_miniseq_id, dst_group='set4', dst_id=f'{dup_miniseq_id}A')

## Finish up

In [36]:
# Every source file must have a destination by now.
assert len(unassigned_ids()) == 0

### Check set 3

In [37]:
# Set 3 IDs that no source file was assigned to.
set3.difference(assigned_to_ids('set3'))

{'19AC0011213'}

This file was provided separately.

### Check set 4

In [38]:
# Add empty src_file and notes columns (in that order).
set4_table = set4_table.assign(src_file=None, notes=None)

In [39]:
# Record, for each set 4 genome, the name of the source file assigned to it.
for src_id, row in assignments.iterrows():
    if row.dst_group == 'set4':
        # Use pd.isnull (as assign() does) rather than `is None`, so an empty
        # cell is recognised whether pandas stores it as None or NaN. The
        # message names the ID that would receive a second file.
        assert pd.isnull(set4_table.loc[row.dst_id, 'src_file']), row.dst_id
        set4_table.loc[row.dst_id, 'src_file'] = src_files[src_id].name

In [40]:
# This file provided separately
set4_table.at['21-00368644B', 'notes'] = 'provided separately'

In [41]:
# These two are being left out, apparently; set the notes of both rows to
# 'Removed'.
set4_table.loc[['20-00249850', '21-00475253'], 'notes'] = 'Removed'

In [42]:
# Set 4 rows that did not receive a source file.
set4_table.loc[set4_table['src_file'].isna()]

Unnamed: 0_level_0,miniseq_id,run_id,src_file,notes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20-00249850,20-00249850,NV-MN01149-210701,,Removed
21-00368644B,21-00368644,NV-MN01149-210701,,provided separately
21-00475253,21-00475253,NV-MN01149-211203,,Removed


## Write output

In [43]:
# Save the source-file assignments table as CSV.
assignments.to_csv(path_or_buf=outfiles['assignments'])

In [44]:
# Save the set 4 table (with source files and notes) as CSV.
set4_table.to_csv(path_or_buf=outfiles['set4_table'])

## Copy files

In [45]:
# Copy each assigned source file into the directory of its destination group,
# named "<dst_id>.fasta". Rows in any other group (e.g. 'set3 removed') are
# not copied.
_dst_dirs = {
    'set3': outfiles['set3_files'],
    'set4': outfiles['set4_files'],
}

# shutil.copyfile() does not create missing directories, so make sure the
# destinations exist before copying.
for _dst_dir in _dst_dirs.values():
    _dst_dir.mkdir(parents=True, exist_ok=True)

for src_id, row in assignments.iterrows():
    if row.dst_group in _dst_dirs:
        shutil.copyfile(src_files[src_id], _dst_dirs[row.dst_group] / f'{row.dst_id}.fasta')