In [1]:
from pathlib import Path

In [2]:
import pandas as pd

## Setup

In [3]:
# Date stamp shared by this notebook's name and its output file names.
DATESTR = '220420'
# Notebook name; also used as the output sub-directory name.
NBNAME = f'{DATESTR}-pt-table-results-comparison'

In [4]:
# Input files read by this notebook, keyed by a short label.
infiles = {
    'david_table': Path('data-src/220420-midas-pt-table.csv'),
    'my_table': Path('data-processed/220419-set3-files-list/220419-200726-gold-standard-files.csv'),
    'gambit': Path('data-src/220420-query-1.0-beta2.csv'),
}

In [5]:
# Output directory for this notebook: data-processed/<NBNAME>.
# Only the leaf directory is created; 'data-processed' itself must already exist.
processed_out = Path('data-processed', NBNAME)
processed_out.mkdir(exist_ok=True)

# Output files written by this notebook, keyed by a short label.
outfiles = {
    'comp': processed_out / f'{DATESTR}-pt-table-results-comp.csv',
}

## Code

In [6]:
def yesno_to_bool(x):
    """Convert the exact strings 'yes' / 'no' to True / False.

    Raises
    ------
    ValueError
        If ``x`` is anything other than 'yes' or 'no' (the offending value
        is passed as the exception argument).
    """
    if x not in ('yes', 'no'):
        raise ValueError(x)
    return x == 'yes'
    

def stripprefix(prefix, s, strict=True):
    """Return ``s`` with a leading ``prefix`` removed.

    Parameters
    ----------
    prefix : str
        Prefix to remove.
    s : str
        String to strip.
    strict : bool
        When true (the default), raise if ``s`` does not start with
        ``prefix``; otherwise return ``s`` unchanged in that case.

    Raises
    ------
    ValueError
        If ``strict`` is true and ``s`` does not start with ``prefix``.
    """
    if not s.startswith(prefix):
        if strict:
            raise ValueError('String does not contain prefix')
        return s
    return s[len(prefix):]
    

def stripsuffix(suffix, s, strict=True):
    """Return ``s`` with a trailing ``suffix`` removed.

    Parameters
    ----------
    suffix : str
        Suffix to remove.
    s : str
        String to strip.
    strict : bool
        When true (the default), raise if ``s`` does not end with
        ``suffix``; otherwise return ``s`` unchanged in that case.

    Raises
    ------
    ValueError
        If ``strict`` is true and ``s`` does not end with ``suffix``.
    """
    if s.endswith(suffix):
        # Slice by an explicit end position: ``s[:-len(suffix)]`` would be
        # ``s[:-0]`` == ``''`` for an empty suffix and discard the whole string.
        return s[:len(s) - len(suffix)]
    if strict:
        raise ValueError('String does not contain suffix')
    return s

## Load data

In [7]:
# Load the CSV registered as infiles['my_table'].
# NOTE(review): `my_df` is not referenced again in the visible cells of this
# notebook -- confirm whether this load is still needed.
my_df = pd.read_csv(infiles['my_table'])

### David

In [8]:
# Load the CSV registered as infiles['david_table'] with pandas' default parsing.
david = pd.read_csv(infiles['david_table'])

In [9]:
# Discard the 'jared_id needs edit' column, then index the rows by 'jared_id'.
david = david.drop(columns=['jared_id needs edit']).set_index('jared_id')

In [10]:
# Rename all remaining columns positionally; the list length must equal the
# number of columns in `david` or pandas raises.
david.columns = [
    'wgs_date',
    'class',
    'short_name',
    'fastq_name',
    'gambit_dist',
    'gambit_sp_thresh',
    'has_16s',
    'gambit_genus',
    'gambit_species',
    'reported',
    'cap',
]

In [11]:
# Convert the 'has_16s' column from 'yes'/'no' strings to booleans.
# yesno_to_bool raises ValueError for any other value, so an unexpected or
# missing entry stops the notebook here rather than passing through silently.
david['has_16s'] = david['has_16s'].map(yesno_to_bool)

In [12]:
# Rows whose species is the literal string 'none' get None stored instead,
# marking the species as missing.
david.loc[david['gambit_species'] == 'none', 'gambit_species'] = None

In [13]:
# Rows whose genus is the literal string 'none' get None stored instead,
# marking the genus as missing.
david.loc[david['gambit_genus'] == 'none', 'gambit_genus'] = None

In [14]:
# Build the full predicted name: "<genus> <species>" when a species is present,
# otherwise the genus alone (which may itself be missing).
# pd.isnull() treats both None and NaN as missing, so a blank CSV cell (read by
# pandas as NaN) falls back to the genus instead of reaching str.join and
# raising TypeError. Iterating the two columns directly avoids building a
# Series per row with iterrows().
david['gambit_predicted'] = [
    genus if pd.isnull(species) else ' '.join([genus, species])
    for genus, species in zip(david['gambit_genus'], david['gambit_species'])
]

### GAMBIT results

In [15]:
# Load the CSV registered as infiles['gambit'] with pandas' default parsing.
gambit = pd.read_csv(infiles['gambit'])

In [16]:
# Index the GAMBIT results by the 'query.name' column.
# NOTE(review): not idempotent -- re-running this cell without reloading
# `gambit` raises KeyError because 'query.name' is no longer a column.
gambit = gambit.set_index('query.name')

## Inspect

In [17]:
# Number of rows in `david` for each distinct 'class' value.
david.groupby('class').size()

class
BACT1_2016    6
BACT1_2017    6
BACT1_2018    6
BACT1_2019    6
BACT1_2020    7
BACT2_2017    7
BACT2_2018    7
BACT2_2019    8
BACT3_2017    7
BACT3_2018    8
BACT3_2019    5
LPX1 2017     3
LPX1_2018     3
LPX1_2019     3
LPX2_2018     3
LPX2_2019     3
dtype: int64

In [18]:
# Show only the rows flagged has_16s == True.
david.loc[david['has_16s']]

Unnamed: 0_level_0,wgs_date,class,short_name,fastq_name,gambit_dist,gambit_sp_thresh,has_16s,gambit_genus,gambit_species,reported,cap,gambit_predicted
jared_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
17AC0006313-1,6/29/17,BACT2_2017,17AC0006313-1_S23.fasta,17AC0006313-1_S23_L001_R1_001.fastq,0.913,0.944,True,Veillonella,,"Veilonella genus with GAMBIT, V. parvula with 16S",Mixed Sample 2: Staphylococcus hominis and Vei...,Veillonella
18AC0018937-BAP,10/25/18,BACT3_2018,18AC0018937-BAP,18AC0018937-BAP_S11_L001_R1_001.fastq,0.942,,True,,,Proteus vulgaris,Proteus sp / Proteus vulgaris,
19AC0011210,7/9/19,BACT2_2019,19AC0011210-W1,19AC0011210_S5_L001_R1_001.fastq,0.992,,True,,,Granulicatella adiacens,Granulicatella sp / Granulicatella adiacens,
19AC0016708,10/17/19,BACT3_2019,19AC0016708,19AC0016708_S8_L001_R1_001.fastq,0.325,0.921,True,Pseudomonas,,Pseudomonas putida group (species reported wit...,Pseudomonas sp/Pseudomonas putida /Pseudomonas...,Pseudomonas


In [19]:
# Show rows where "<genus> <species>" differs from the 'reported' column.
# Rows with a missing genus or species are always included, because the
# concatenation is then missing and never compares equal.
david.loc[
    lambda d: (d['gambit_genus'] + ' ' + d['gambit_species']) != d['reported']
]

Unnamed: 0_level_0,wgs_date,class,short_name,fastq_name,gambit_dist,gambit_sp_thresh,has_16s,gambit_genus,gambit_species,reported,cap,gambit_predicted
jared_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
17AC0001410,2/25/17,BACT1_2017,17AC0001410.fasta,17AC0001410_S9_L001_R1_001.fastq,0.161,0.347,False,Escherichia,coli,Negative for E coli O157/Aeromonas/Campylobact...,Mixed Sample 1: Exclude E coli O157/Aeromonas/...,Escherichia coli
17AC0001410A,3/4/17,BACT1_2017,17AC0001410A.fasta,17AC0001410A_S12_L001_R1_001.fastq,0.131,0.47,False,Enterococcus,faecalis,Negative for E coli O157/Aeromonas/Campylobact...,Mixed Sample 1: Exclude E coli O157/Aeromonas/...,Enterococcus faecalis
17AC0006313-1,6/29/17,BACT2_2017,17AC0006313-1_S23.fasta,17AC0006313-1_S23_L001_R1_001.fastq,0.913,0.944,True,Veillonella,,"Veilonella genus with GAMBIT, V. parvula with 16S",Mixed Sample 2: Staphylococcus hominis and Vei...,Veillonella
17AC0012453,10/26/17,BACT3_2017,17AC0012453_S1.fasta,17AC0012453_S1_L001_R1_001.fastq,0.886,0.928,False,Corynebacterium,,Corynebacterium sp,Corynebacterium sp /Corynebacterium striatum,Corynebacterium
17AC0012454,10/26/17,BACT3_2017,17AC0012454_S2.fasta,17AC0012454_S2_L001_R1_001.fastq,0.019,0.937,False,Aerococcus,,Aerococcus sp,Aerococcus sp/ Aerococcus viridans,Aerococcus
18AC0002826,2/23/18,BACT1_2018,18AC0002826,18AC0002826_S11_L001_R1_001.fastq,0.016,0.469,False,Streptococcus,agalactiae,Streptococcus agalactiae group B /Finegoldia m...,Streptococcus agalactiae group B /Finegoldia m...,Streptococcus agalactiae
18AC0018937-BAP,10/25/18,BACT3_2018,18AC0018937-BAP,18AC0018937-BAP_S11_L001_R1_001.fastq,0.942,,True,,,Proteus vulgaris,Proteus sp / Proteus vulgaris,
19AC0011210,7/9/19,BACT2_2019,19AC0011210-W1,19AC0011210_S5_L001_R1_001.fastq,0.992,,True,,,Granulicatella adiacens,Granulicatella sp / Granulicatella adiacens,
19AC0016708,10/17/19,BACT3_2019,19AC0016708,19AC0016708_S8_L001_R1_001.fastq,0.325,0.921,True,Pseudomonas,,Pseudomonas putida group (species reported wit...,Pseudomonas sp/Pseudomonas putida /Pseudomonas...,Pseudomonas
19AC0016710,10/17/19,BACT3_2019,19AC0016710,19AC0016710_S10_L001_R1_001.fastq,0.039,0.522,False,Streptococcus,pyogenes,Streptococcus pyogenes,Streptococcus pyogenes,Streptococcus pyogenes


## Compare to latest GAMBIT results

In [20]:
# Columns from the PT table to carry into the comparison, each prefixed 'pt_'.
left = david[[
    'gambit_dist',
    'gambit_sp_thresh',
    'gambit_genus',
    'gambit_species',
    'reported',
    'cap',
    'gambit_predicted',
]].add_prefix('pt_')

In [21]:
# New GAMBIT results without 'query.path', columns prefixed 'new_' and the
# index renamed to 'jared_id'. `gambit` itself is left untouched.
right = (
    gambit
    .drop(columns=['query.path'])
    .add_prefix('new_')
    .rename_axis('jared_id')
)

In [22]:
# Left join: keep every row of `left` and attach the `right` columns whose
# index matches `left`'s 'jared_id'. Rows with no match get missing values
# in the 'new_*' columns.
# NOTE(review): assumes `right`'s index is unique; duplicate labels there
# would duplicate rows in `comp` -- confirm.
comp = left.join(right, on='jared_id')

In [23]:
# A row matches when the old and new predicted names are equal, or when both
# are missing (missing values never compare equal, so that case is OR-ed in).
matches = (
    comp['new_predicted.name'].eq(comp['pt_gambit_predicted'])
    | (comp['new_predicted.name'].isnull() & comp['pt_gambit_predicted'].isnull())
)

In [24]:
# Show the rows where the old and new predictions disagree.
comp.loc[~matches]

Unnamed: 0_level_0,pt_gambit_dist,pt_gambit_sp_thresh,pt_gambit_genus,pt_gambit_species,pt_reported,pt_cap,pt_gambit_predicted,new_predicted.name,new_predicted.rank,new_predicted.ncbi_id,new_predicted.threshold,new_closest.distance,new_closest.description,new_next.name,new_next.rank,new_next.ncbi_id,new_next.threshold
jared_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
16AC1611140BCAP,0.337,0.554,Propionibacterium (Cutibacterium),acnes,Propionibacterium (Cutibacterium) acnes,Propionibacterium,Propionibacterium (Cutibacterium) acnes,Cutibacterium acnes,species,1747.0,0.553531,0.337403,[GCF_000145115.1] Cutibacterium acnes HL030PA1...,,,,


## Write output

In [25]:
# Write the full comparison table (matching and non-matching rows, index
# included) to outfiles['comp'].
comp.to_csv(outfiles['comp'])