# 221004 Set4 AG Data

In [1]:
from pathlib import Path

In [2]:
import pandas as pd

## Setup

In [3]:
# Date stamp shared by the notebook name and the output file names.
DATESTR = '221004'
# Notebook identifier; also used as the name of this notebook's output directory.
NBNAME = DATESTR + '-set4-ag-data'

In [4]:
# Input files read by this notebook, keyed by a short dataset name.
infiles = dict(
    # pathlib never expands "~" on its own; expand it here so the path is valid
    # for any consumer, not only for pandas readers that happen to expand it.
    set4=Path('~/code/gambit/gambit-publication/resources/genomes/set4/genomes.csv').expanduser(),
    # Relative to the notebook's working directory.
    ag=Path('src/220923-GAMBIT-Species-ID-Comparison-AG.xlsx'),
)

In [5]:
# Per-notebook output directory: data-processed/<NBNAME>/
data_processed = Path('data-processed') / NBNAME
# parents=True so the cell also works when the top-level 'data-processed'
# directory does not exist yet (e.g. a fresh checkout); exist_ok keeps re-runs safe.
data_processed.mkdir(parents=True, exist_ok=True)

# Output files written by this notebook, keyed by a short name.
outfiles = dict(
    table=data_processed / f'{DATESTR}-ag-data.csv',
)

## Load data

In [6]:
# The first CSV column becomes the index; later cells use it as the genome ID.
set4 = pd.read_csv(infiles['set4'], index_col=0)
# IDs are used as lookup keys below, so they must not repeat.
assert not set4.index.has_duplicates

In [7]:
# Load the AG workbook (sheet_name=2 selects a sheet by zero-based position),
# drop the unused column, fix dtypes and index the rows by their miniseq ID.
ag = (
    pd.read_excel(infiles['ag'], sheet_name=2)
    .drop(columns='Temp')  # Don't think this column means anything
    .astype({
        'entity:miniseq_id': str,
        'genome_length': pd.Int64Dtype(),  # nullable integer dtype
    })
    .set_index('entity:miniseq_id')
)
# IDs are used as lookup keys below, so they must not repeat.
assert ag.index.is_unique

## Match up rows

In [8]:
# Map set4 genome ID -> AG row ID. Start with the IDs that appear verbatim in
# both tables (in set4 order); each of those maps to itself.
shared_ids = set4.index[set4.index.isin(ag.index)]
id_map = dict(zip(shared_ids, shared_ids))
# (number matched so far, total number of set4 genomes)
len(id_map), len(set4)

(603, 605)

In [9]:
# set4 genomes that have no entry in id_map yet.
# The set4 IDs are the *keys* of id_map (the values are AG IDs), so test against
# the keys; this keeps the cell correct even when re-run after non-identity
# mappings have been added.
set4[~set4.index.isin(list(id_map))]

Unnamed: 0_level_0,n_contigs,total_length,N50,L50,md5,filename
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
21-00368644A,47,5043096,418498,5,d68d05ae6b8a83d9836eb5d4518bc2f2,21-00368644A.fasta.gz
21-00368644B,70,1665864,78104,8,8b06a215189922bbf644004cd758d3a5,21-00368644B.fasta.gz


In [10]:
# Look for AG rows whose ID shares the prefix of the two unmatched set4 genomes.
ag.loc[ag.index.str.startswith('21-00368644')]

Unnamed: 0_level_0,biosample_accession,submission_id,amrfinderplus_amr_genes,ts_mlst_predicted_st,gambit_predicted_taxon,city,received_date,Provider,collection_date,dataset,...,isolation_type,lat_lon,library_ID,library_layout,library_selection,library_source,library_strategy,organism,platform,title
entity:miniseq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21-00368644,SAMN30861796,NV_NSPHL_0000329,"oqxA,oqxB,aph(3')-Ia,fosA,blaACT-17,blaKPC-2,b...",ST204,Campylobacter jejuni,not collected,2021-06-21,Lemak,2021-06-17,368644-NV-A01307-210721,...,Clinical,not collected,NV_NSPHL_0000329,paired,RANDOM,GENOMIC,WGS,Campylobacter jejuni,ILLUMINA,WGS of HAIs
21-00368644b,,,blaOXA,ST50,Campylobacter jejuni,,NaT,,NaT,CL2021-00368644-NV-MN01149-210701,...,,,,,,,,,,


In [11]:
# Manual matches for the two set4 genomes without an identically named AG row.
# Just guessing here (based on the shared ID prefix), but this seems right.
id_map.update({
    '21-00368644A': '21-00368644',
    '21-00368644B': '21-00368644b',
})

In [12]:
# Every set4 genome has a mapping, and nothing else does ...
assert id_map.keys() == set(set4.index)
# ... and every mapping target is an existing AG row.
assert set(id_map.values()) <= set(ag.index)

In [13]:
# Pull the AG rows in set4 order, keep the original AG ID as a regular column
# (reset_index) and re-index the result by the set4 genome IDs.
ag_ids_in_set4_order = [id_map[gid] for gid in set4.index]
ag2 = (
    ag.loc[ag_ids_in_set4_order]
    .reset_index()
    .set_index(set4.index)
)

## Inspect

In [14]:
# Shorthand for the biosample accession column, reused by the inspection filters below.
# NOTE(review): ag2 is edited in place further down; whether this Series reflects
# those edits depends on pandas' copy/view behaviour - confirm before relying on it.
bs_acc = ag2['biosample_accession']

### Duplicated biosample

In [15]:
# Rows sharing a (non-null) biosample accession with at least one other row;
# keep=False marks every member of a duplicate group, not only the repeats.
ag2[bs_acc.notna() & bs_acc.duplicated(keep=False)]

Unnamed: 0_level_0,entity:miniseq_id,biosample_accession,submission_id,amrfinderplus_amr_genes,ts_mlst_predicted_st,gambit_predicted_taxon,city,received_date,Provider,collection_date,...,isolation_type,lat_lon,library_ID,library_layout,library_selection,library_source,library_strategy,organism,platform,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PNUSAE019069,PNUSAE019069,SAMN10182651,,"emrD,aadA1,sat2,dfrA1,blaEC,blaOXA-1,catA1,aph...",ST245,Shigella flexneri,,NaT,,NaT,...,,,,,,,,,,
PNUSAE020883,PNUSAE020883,SAMN10182651,,"erm(B),emrD,qnrS1,tet(B),blaEC,mph(A),sul1,aad...",ST245,Shigella flexneri,,NaT,,NaT,...,,,,,,,,,,


### Biosample accession pattern

In [16]:
# Rows whose biosample accession is present but is not exactly "SAMN" followed by digits.
# fullmatch is used instead of match: match only anchors at the start of the string,
# so a value with trailing junk after a valid prefix would pass unnoticed.
# na=True treats missing accessions as matching, so they are not listed here.
ag2[~bs_acc.str.fullmatch(r'SAMN\d+', na=True)]

Unnamed: 0_level_0,entity:miniseq_id,biosample_accession,submission_id,amrfinderplus_amr_genes,ts_mlst_predicted_st,gambit_predicted_taxon,city,received_date,Provider,collection_date,...,isolation_type,lat_lon,library_ID,library_layout,library_selection,library_source,library_strategy,organism,platform,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21-00372063,21-00372063,QC Fail,,"acrF,emrD,blaEC,tet(A),aph(3'')-Ib,aph(6)-Id,mdtM",No ST predicted,Escherichia coli,Carson City,2021-07-13,Carson - Tahoe Regional Healthcare,2021-07-09,...,,,,,,,,,,


In [17]:
# Add a notes column, initialised to None for every row; filled in per-row below.
ag2['jared_notes'] = None

In [18]:
# This row has a non-accession value in the biosample_accession column.
# Move that value to the notes column and blank out the accession.
id_ = '21-00372063'

misplaced_value = ag2.loc[id_, 'biosample_accession']

# Guard makes the cell idempotent: without it, re-running the cell would copy the
# already-blanked accession over the note and lose it.
if pd.notnull(misplaced_value):
    ag2.loc[id_, 'jared_notes'] = misplaced_value
    ag2.loc[id_, 'biosample_accession'] = None

### Null biosample

In [19]:
# Rows without a biosample accession.
# The mask is recomputed from ag2 rather than taken from the previously extracted
# `bs_acc` Series: ag2 has been edited in place since then, and with pandas
# Copy-on-Write the earlier Series would not reflect that edit.
ag2[ag2['biosample_accession'].isnull()]

Unnamed: 0_level_0,entity:miniseq_id,biosample_accession,submission_id,amrfinderplus_amr_genes,ts_mlst_predicted_st,gambit_predicted_taxon,city,received_date,Provider,collection_date,...,lat_lon,library_ID,library_layout,library_selection,library_source,library_strategy,organism,platform,title,jared_notes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
03-98DDCS,03-98DDCS,,,"blaEC,emrD,acrF,mdtM",ST11,Escherichia coli,,NaT,,NaT,...,,,,,,,,,,
0398KL,0398KL,,,"mdtM,blaEC,emrD,acrF",ST11,Escherichia coli,,NaT,,NaT,...,,,,,,,,,,
1658-pos,1658-pos,,,"oqxA,oqxB,fosA,emrD,kdeA,blaSHV-28,blaCTX-M-15...",ST14,Klebsiella pneumoniae,,NaT,,NaT,...,,,,,,,,,,
1736855-UT,1736855-UT,,,"blaA,vat(F)",No ST predicted,Yersinia enterocolitica,,NaT,,NaT,...,,,,,,,,,,
21-00368644B,21-00368644b,,,blaOXA,ST50,Campylobacter jejuni,,NaT,,NaT,...,,,,,,,,,,
21-00372063,21-00372063,,,"acrF,emrD,blaEC,tet(A),aph(3'')-Ib,aph(6)-Id,mdtM",No ST predicted,Escherichia coli,Carson City,2021-07-13,Carson - Tahoe Regional Healthcare,2021-07-09,...,,,,,,,,,,QC Fail
22-00020994,22-00020994,,,No AMR genes detected by NCBI-AMRFinderPlus,No ST predicted,Campylobacter,Phoenix,2022-01-20,Laboratory Corporation of America,2022-01-13,...,not collected,NV_NSPHL_0000449,paired,RANDOM,GENOMIC,WGS,Campylobacter lari,ILLUMINA,WGS of HAIs,
22-00044713,22-00044713,,NV_NSPHL_0000491,No AMR genes detected by NCBI-AMRFinderPlus,No ST predicted,Actinomyces spp,not collected,NaT,,2022-02-27,...,not collected,NV_NSPHL_0000491,paired,RANDOM,GENOMIC,WGS,Actinomyces spp,ILLUMINA,WGS of HAIs,
249842-H2,249842-H2,,NV_NSPHL_0000545,"blaSHV-11,oqxB,oqxA,fosA7,emrD,kdeA,fosA,tet(A...",No ST predicted,Klebsiella pneumoniae,not collected,NaT,,2020-10-23,...,not collected,NV_NSPHL_0000545,paired,RANDOM,GENOMIC,WGS,Klebsiella pneumoniae,ILLUMINA,WGS of HAIs,
C6472DDCS,C6472DDCS,,,"mdsA,mdsB,fosA7,aadA1,sul1,blaTEM-1",ST15,Salmonella enterica,,NaT,,NaT,...,,,,,,,,,,


## Output

In [20]:
# Write the matched AG table (indexed by set4 genome ID) as CSV.
table_path = outfiles['table']
ag2.to_csv(table_path)