### 210730 Arrange files

In [1]:
import filecmp
from pathlib import Path
import re
import shutil

import pandas as pd

## Setup

In [2]:
# Root directory of the data set.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
data_dir = Path(
    '/home/jared/projects/gambit/data/validation/200726-gold-standard-seqs/'
)

In [3]:
# Directory the files are read from (one subdirectory per set, see the listing cell).
src_dir = data_dir.joinpath('fasta-original')

In [4]:
# Directory the renamed copies are written to.
dst_dir = data_dir.joinpath('fasta')

## Get list of files

In [5]:
# One record per *.fasta file found in each set's subdirectory of src_dir:
# (set label, full path, bare file name).
_records = [
    (set_label, fasta_path, fasta_path.name)
    for set_label in ('200726', '200817')
    for fasta_path in (src_dir / set_label).glob('*.fasta')
]

df = pd.DataFrame.from_records(
    _records,
    columns=['set', 'original_path', 'original_filename'],
)

### Parse old filenames

In [6]:
# Groups used below: 1 -> part1, 3 -> part2 (optional, after '-'),
# 5 -> part3 (after '_'), 6 -> 'Unpaired' suffix or ''.
fname_pattern = r'([^-]+)(-([^_]+)|)(_(.+))_L001_R1_001 Assembly Contigs(Unpaired|)\.fasta'


def _parse_fname(fname):
    """Split an original file name into ``(part1, part2, part3, unpaired_suffix)``.

    ``part2`` is ``None`` when the name has no ``-<part2>`` section, and
    ``unpaired_suffix`` is either ``'Unpaired'`` or ``''``.

    Raises:
        ValueError: if ``fname`` does not match ``fname_pattern``. Without this
            check the failure would surface as an AttributeError on ``None``
            that does not say which file was the problem.
    """
    match = re.fullmatch(fname_pattern, fname)
    if match is None:
        raise ValueError(f'File name does not match expected pattern: {fname!r}')
    return match.group(1, 3, 5, 6)


_parts = pd.DataFrame.from_records(
    [_parse_fname(fname) for fname in df['original_filename']],
    columns=['part1', 'part2', 'part3', 'unpaired'],
)
# Turn the captured suffix ('Unpaired' or '') into a boolean flag.
_parts['unpaired'] = _parts['unpaired'] != ''

# Both frames share the same default integer index, so this is a row-wise join.
df = pd.concat([df, _parts], axis=1)

## Check for duplicates

Two pairs of files share identical part 1/2/3 values; in each pair, one file is marked unpaired and the other is not. Check whether the two files in each pair have identical contents.

In [7]:
# Map each (part1, part2, part3) key shared by more than one row to the index
# labels of those rows. dropna=False keeps groups whose key has a missing part.
_grouped = df.groupby(['part1', 'part2', 'part3'], dropna=False)
duplicates = {
    key: group.index
    for key, group in _grouped
    if len(group) > 1
}

In [8]:
# Show the rows belonging to each duplicated key.
for indices in duplicates.values():
    display(df.loc[indices])

Unnamed: 0,set,original_path,original_filename,part1,part2,part3,unpaired
24,200726,/home/jared/projects/gambit/data/validation/20...,17AC0001411B_S10_L001_R1_001 Assembly Contigs....,17AC0001411B,,S10,False
90,200817,/home/jared/projects/gambit/data/validation/20...,17AC0001411B_S10_L001_R1_001 Assembly ContigsU...,17AC0001411B,,S10,True


Unnamed: 0,set,original_path,original_filename,part1,part2,part3,unpaired
13,200726,/home/jared/projects/gambit/data/validation/20...,17AC0012455-1A_S1_L001_R1_001 Assembly Contigs...,17AC0012455,1A,S1,False
80,200817,/home/jared/projects/gambit/data/validation/20...,17AC0012455-1A_S1_L001_R1_001 Assembly Contigs...,17AC0012455,1A,S1,True


In [9]:
# Compare the contents of the two files in each duplicated group.
for key, indices in duplicates.items():
    files = df.loc[indices, 'original_path']
    # `files` is indexed by the original row labels, not 0..n-1, so the two
    # paths must be selected by position.
    path_a, path_b = files.iloc[0], files.iloc[1]
    # shallow=False forces a byte-by-byte comparison; a missing or unreadable
    # file raises instead of being reported as "different".
    identical = filecmp.cmp(path_a, path_b, shallow=False)
    print(key, 'identical' if identical else 'different')

('17AC0001411B', nan, 'S10') different
('17AC0012455', '1A', 'S1') different


## Rename/move files

In [10]:
def _build_name(part1, part2, part3, unpaired):
    """Assemble the new base file name ``part1[-part2][_part3][_unpaired]``.

    ``part2`` / ``part3`` are skipped when missing (``None`` or NaN) or empty.
    NaN has to be tested explicitly because it is truthy, so a plain ``if part2``
    would try to concatenate it to a string.
    """
    name = part1
    if pd.notna(part2) and part2:
        name += '-' + part2
    if pd.notna(part3) and part3:
        name += '_' + part3
    if unpaired:
        name += '_unpaired'
    return name


_new_names = [
    _build_name(row.part1, row.part2, row.part3, row.unpaired)
    for row in df.itertuples()
]

# Drop a 'name' column left over from a previous run of this cell so that
# re-running it does not fail on insert().
if 'name' in df.columns:
    df = df.drop(columns='name')
df.insert(1, 'name', _new_names)

In [11]:
# Every file must get a distinct new name, otherwise copies would overwrite each other.
assert df['name'].is_unique

In [12]:
# Create the destination directory unless it already exists (parents are not created).
dst_dir.mkdir(parents=False, exist_ok=True)

In [13]:
# Copy each original file to dst_dir under its new name, overwriting any
# existing copy. shutil.copy raises on failure, whereas a failing `! cp`
# shell call would go unnoticed.
for row in df.itertuples():
    shutil.copy(row.original_path, dst_dir / f'{row.name}.fasta')

## Save table

In [14]:
# Remove the original_path column before the table is saved.
df.drop(columns=['original_path'], inplace=True)

In [15]:
# Order rows by set, then by new name.
df = df.sort_values(by=['set', 'name'])

In [16]:
# Write the file table next to the data, without the index column.
_csv_path = data_dir / 'files.csv'
df.to_csv(_csv_path, index=False)