# 210730 validation results comparison

In [1]:
from pathlib import Path

In [2]:
import pandas as pd

## Setup

In [3]:
DATESTR = '210730'
NBNAME = DATESTR + '-validation-results-comparison'

In [4]:
infiles = dict(
    old_results=Path('data-input/210730-validation-results-comparison/original-results.csv'),
    new_results=Path('data-processed/210730-validation/210730-1.0b-validation-primary-matches.csv'),
    join=Path('data-input/210730-validation-results-comparison/210730-validation-results-join.csv'),
)

In [5]:
processed_out = Path('data-processed') / NBNAME
if not processed_out.is_dir():
    processed_out.mkdir()

## Load data

In [6]:
old_results = pd.read_csv(infiles['old_results'])

In [7]:
new_results = pd.read_csv(infiles['new_results'])

In [8]:
join_df = pd.read_csv(infiles['join'])

### Duplicate rows in old results

In [9]:
old_results[old_results.duplicated(['query_file'], keep=False)]

Unnamed: 0,Date of WGS run,Class of PT,query_file,top_score,predicted_genus,predicted_species,top_genus,genus_threshold,top_species,species_threshold,top_strain,top_description,top_accession
83,10/17/2019,BACT3_2019,19AC0016708,0.673881,Pseudomonas,,Pseudomonas,0.2,monteilii,0.994141,NBRC 103158,[GCF_000730605.1] Pseudomonas monteilii NBRC 1...,GCF_000730605.1
88,10/24/2019,BACT3_2019,19AC0016708,0.67099,Pseudomonas,,Pseudomonas,0.2,monteilii,0.994141,NBRC 103158,[GCF_000730605.1] Pseudomonas monteilii NBRC 1...,GCF_000730605.1


Not precisely the same, but close enough. Remove 2nd row.

In [10]:
old_results.drop_duplicates('query_file', inplace=True)

### Change scores to distances in old results

In [11]:
for col in ['top_score', 'genus_threshold', 'species_threshold']:
    old_results[col] = 1 - old_results[col]

In [12]:
old_results.rename(columns=dict(top_score='top_distance'), inplace=True)

## Merge

In [13]:
df = pd.merge(
    join_df,
    new_results.rename(columns=lambda c: 'new.' + c),
    how='left',
    left_on='new_file',
    right_on='new.name',
    validate='m:1',
)
del df['new.name']

In [14]:
df = pd.merge(
    df,
    old_results.rename(columns=lambda c: 'old.' + c),
    how='left',
    left_on='old_file',
    right_on='old.query_file',
    validate='m:1',
)
del df['old.query_file']

In [15]:
merge_ok = ~pd.isnull(df['new_file']) & ~pd.isnull(df['old_file'])

## Additional annotations

In [16]:
def set_int_or_none(df, col):
    df[col] = pd.Series([None if pd.isnull(v) else int(v) for v in df[col]], dtype=object)

In [17]:
def insertcol(name, values):
    if name in df.columns:
        df[name] = values
    else:
        i = list(df.columns).index('new.set')
        df.insert(i, name, values)

In [18]:
set_int_or_none(df, 'new.set')
set_int_or_none(df, 'new.predicted.ncbi_id')
set_int_or_none(df, 'new.closest_genome.taxon.ncbi_id')

In [19]:
insertcol(
    'new_prediction_level',
    [
        None if pd.isnull(row['new_file']) else
            'none' if pd.isnull(row['new.predicted.name']) else
            'species' if pd.isnull(row['new.predicted.rank']) else
            row['new.predicted.rank']
        for i, row in df.iterrows()
    ],
)

In [20]:
insertcol(
    'old_prediction_level',
    [
        None if pd.isnull(row['old_file']) else
            'none' if pd.isnull(row['old.predicted_genus']) else
            'genus' if pd.isnull(row['old.predicted_species']) else
            'species'
        for i, row in df.iterrows()
    ],
)

In [21]:
insertcol('notes', None)

## Compare

### Automatic

In [22]:
consistent = []

for i, row in df.iterrows():
    newlevel = row['new_prediction_level']
    oldlevel = row['old_prediction_level']
    
    if newlevel in [None, 'none'] or oldlevel in [None, 'none']:
        consistent.append(None)
        continue
        
    assert newlevel in ['genus', 'species']
    assert oldlevel in ['genus', 'species']
        
    # Custom taxon, needs manual check
    if pd.isnull(row['new.predicted.ncbi_id']):
        consistent.append('unknown')
        continue
        
    if newlevel == 'genus':
        new_genus = row['new.predicted.name']
        value = new_genus == row['old.predicted_genus']
        
    else:
        new_genus, new_species = row['new.predicted.name'].split(' ', 1)
        genus_match = new_genus == row['old.predicted_genus']
        
        if oldlevel == 'genus':
            value = genus_match
        else:
            value = genus_match and new_species == row['old.predicted_species']
        
    consistent.append(value)
    
insertcol('predictions_consistent', consistent)

In [23]:
pd.value_counts(consistent, dropna=False)

True       75
NaN        13
unknown     8
False       4
dtype: int64

### Inspect inconsistencies

In [24]:
df[df['predictions_consistent'] == False]

Unnamed: 0,new_file,old_file,file_match_ok,new_prediction_level,old_prediction_level,notes,predictions_consistent,new.set,new.predicted.name,new.predicted.ncbi_id,...,old.top_distance,old.predicted_genus,old.predicted_species,old.top_genus,old.genus_threshold,old.top_species,old.species_threshold,old.top_strain,old.top_description,old.top_accession
4,16AC1611140BCAP_S11,16AC1611140B-CAP,,species,genus,,False,200817,Cutibacterium acnes,1747,...,0.3293,Propionibacterium,,acnes,,,0.085357,,Propionibacterium acnes,
22,17AC0006313-1_S23_unpaired,17AC0006313-1_S23.fasta,,genus,genus,,False,200817,Veillonella,29465,...,0.912754,"(16S, 99% match V.parvula)",,Veillonella,0.8,parvula,0.607411,,,
68,19AC0002349_S5,19AC0002349,,species,species,,False,200726,Klebsiella aerogenes,548,...,0.043698,Klebsiella,[Enterobacter] aerogenes,Klebsiella,0.8,[Enterobacter] aerogenes,0.371182,170_EAER,[GCF_001055555.1] [Enterobacter] aerogenes (en...,GCF_001055555.1
70,19AC0002349B2_S11,19AC0002349B2,,species,species,,False,200726,Klebsiella aerogenes,548,...,0.044881,Klebsiella,[Enterobacter] aerogenes,Klebsiella,0.8,[Enterobacter] aerogenes,0.371182,170_EAER,[GCF_001055555.1] [Enterobacter] aerogenes (en...,GCF_001055555.1


These are all actually fine.

In [25]:
df.loc[4, 'notes'] = '"Propionibacterium acnes" is homotypic genbank synonym of "Cutibacterium acnes"'

In [26]:
df.loc[[4, 22, 68, 70], 'predictions_consistent'] = True

### Manually check instances where new match is to subgroup taxon

In [27]:
df[df['predictions_consistent'] == 'unknown']

Unnamed: 0,new_file,old_file,file_match_ok,new_prediction_level,old_prediction_level,notes,predictions_consistent,new.set,new.predicted.name,new.predicted.ncbi_id,...,old.top_distance,old.predicted_genus,old.predicted_species,old.top_genus,old.genus_threshold,old.top_species,old.species_threshold,old.top_strain,old.top_description,old.top_accession
1,16AC1611138-CAP_S13,16AC1611138-CAP,,species,species,,unknown,200726,Escherichia coli subgroup 2,,...,0.1591,Escherichia,coli,Escherichia,0.8,coli,0.3463,,Escherichia coli KTE77,
8,17AC0001410_S9,17AC0001410.fasta,,species,species,,unknown,200726,Escherichia coli subgroup 2,,...,0.15992,Escherichia,coli,Escherichia,0.8,coli,0.346256,KTE77,[GCF_000351725.1] Escherichia coli KTE77 (E. c...,GCF_000351725.1
18,17AC0006310_S6,17AC0006310.fasta,,species,species,,unknown,200726,Bacillus cereus subgroup 1,,...,0.104813,Bacillus,cereus,Bacillus,0.8,cereus,0.314048,B4116,[GCF_001619385.1] Bacillus cereus (firmicutes),GCF_001619385.1
40,18AC0007008CHO_S3,18AC0007008CHO_S3s.fasta,,species,species,,unknown,200726,Escherichia coli subgroup 2,,...,0.159698,Escherichia,coli,Escherichia,0.8,coli,0.346256,KTE77,[GCF_000351725.1] Escherichia coli KTE77 (E. c...,GCF_000351725.1
51,18AC0012153M_S10,18AC0012153M,,species,species,,unknown,200817,Escherichia coli subgroup 2,,...,0.161072,Escherichia,coli,Escherichia,0.8,coli,0.346256,KTE77,[GCF_000351725.1] Escherichia coli KTE77 (E. c...,GCF_000351725.1
52,18AC0012154-B_S7,18AC0012154-B,,species,species,,unknown,200817,Escherichia coli subgroup 2,,...,0.160598,Escherichia,coli,Escherichia,0.8,coli,0.346256,KTE77,[GCF_000351725.1] Escherichia coli KTE77 (E. c...,GCF_000351725.1
65,18AC0018938-1_S14,18AC0018938-1,,species,species,,unknown,200726,Escherichia coli subgroup 1,,...,0.10866,Escherichia,coli,Escherichia,0.8,coli,0.346256,TOP2396-3,[GCF_000397565.1] Escherichia coli TOP2396-3 (...,GCF_000397565.1
83,19AC0011213_S10,19AC0011213,,species,species,,unknown,200726,Escherichia coli subgroup 2,,...,0.167595,Escherichia,coli,Escherichia,0.8,coli,0.346256,KTE77,[GCF_000351725.1] Escherichia coli KTE77 (E. c...,GCF_000351725.1


All good.

In [28]:
df.loc[df['predictions_consistent'] == 'unknown', 'predictions_consistent'] = True

### Check closest genome identical

Seems to be the case in many instances.

In [29]:
new_accession = df['new.closest_genome.description'].str.extract(r'\[(GCF_\d+\.\d)\].*', expand=False)

In [30]:
insertcol('closest_genome_identical', (new_accession == df['old.top_accession']).astype(object))

In [31]:
df.loc[~merge_ok, 'closest_genome_identical'] = None

In [32]:
df['closest_genome_identical'].value_counts(dropna=False)

True     78
False    15
NaN       7
Name: closest_genome_identical, dtype: int64

### Check closest genome species matches

Only need to check cases where predictions are not both at the species level.

In [33]:
df.loc[
    df['new_prediction_level'].isin(['genus', 'none']) & df['old_prediction_level'].isin(['genus', 'none']),
    [
        'new_file',
        'old_file',
        'closest_genome_identical',
        'new.predicted.name',
        'old.predicted_genus',
        'old.predicted_species',
        'new.closest_genome.distance',
        'new.closest_genome.description',
        'new.closest_genome.taxon.name',
        'new.closest_genome.taxon.ncbi_id',
        'old.top_distance',
        'old.top_genus',
        'old.top_species',
        'old.top_description',
    ]
]

Unnamed: 0,new_file,old_file,closest_genome_identical,new.predicted.name,old.predicted_genus,old.predicted_species,new.closest_genome.distance,new.closest_genome.description,new.closest_genome.taxon.name,new.closest_genome.taxon.ncbi_id,old.top_distance,old.top_genus,old.top_species,old.top_description
22,17AC0006313-1_S23_unpaired,17AC0006313-1_S23.fasta,False,Veillonella,"(16S, 99% match V.parvula)",,0.912588,[GCF_000024945.1] Veillonella parvula DSM 2008...,Veillonella parvula,29466,0.912754,Veillonella,parvula,
24,17AC0006313ANBLD_S10,17AC0006313ANBLD.fasta,True,Veillonella,,,0.912617,[GCF_000024945.1] Veillonella parvula DSM 2008...,Veillonella parvula,29466,0.912658,Veillonella,parvula,[GCF_000024945.1] Veillonella parvula DSM 2008...
26,17AC0012453_S1_unpaired,17AC0012453_S1.fasta,True,Corynebacterium,,,0.886251,[GCF_001586215.1] Corynebacterium simulans (hi...,Corynebacterium simulans,146827,0.886297,Corynebacterium,simulans,[GCF_001586215.1] Corynebacterium simulans (hi...
27,17AC0012454_S2_unpaired,17AC0012454_S2.fasta,True,Aerococcus,Aerococcus,,0.0189,[GCF_000178435.1] Aerococcus viridans ATCC 115...,Aerococcus viridans,1377,0.022655,Aerococcus,viridans,[GCF_000178435.1] Aerococcus viridans ATCC 115...
59,18AC0018936-1_S12,18AC0018936-1,True,,,,0.939734,[GCF_000276305.1] Staphylococcus epidermidis N...,Staphylococcus epidermidis,1282,0.947987,Staphylococcus,epidermidis,[GCF_000276305.1] Staphylococcus epidermidis N...
64,18AC0018937-BAP_S11,18AC0018937-BAP,True,,,,0.941886,[GCF_000784015.1] Proteus mirabilis (enterobac...,Proteus mirabilis,584,0.941817,Proteus,mirabilis,[GCF_000784015.1] Proteus mirabilis (enterobac...
69,19AC0002349B1_S10,19AC0002349B1,False,,,,0.991803,[GCF_000009685.1] Clostridium perfringens str....,Clostridium perfringens,1502,0.991757,Clostridium,baratii,[GCF_000789395.1] Clostridium baratii str. Sul...
78,19AC0011210_S5,19AC0011210-W1,True,,,,0.991926,[GCF_000169595.1] Ureaplasma urealyticum serov...,Ureaplasma urealyticum,2130,0.991939,Ureaplasma,urealyticum,[GCF_000169595.1] Ureaplasma urealyticum serov...
87,19AC0016708_S8,19AC0016708,True,Pseudomonas,Pseudomonas,,0.325084,[GCF_000730605.1] Pseudomonas monteilii NBRC 1...,Pseudomonas monteilii,76759,0.326119,Pseudomonas,monteilii,[GCF_000730605.1] Pseudomonas monteilii NBRC 1...


Looks like only 69 differs, Clostridium perfringens in new vs baratii in old.

In [34]:
insertcol('closest_genome_species_identical', None)
df.loc[merge_ok, 'closest_genome_species_identical'] = True
df.loc[69, 'closest_genome_species_identical'] = False

## Stats

In [35]:
df.groupby(['file_match_ok'], dropna=False).size()

file_match_ok
?            6
new_only     5
old_only     2
NaN         87
dtype: int64

In [36]:
df.groupby(['old_prediction_level', 'new_prediction_level']).size()

old_prediction_level  new_prediction_level
genus                 genus                    3
                      species                  7
none                  genus                    2
                      none                     4
species               species                 77
dtype: int64

In [37]:
df.loc[merge_ok, 'predictions_consistent'].value_counts(dropna=False)

True    87
NaN      6
Name: predictions_consistent, dtype: int64

In [38]:
df.loc[merge_ok, 'closest_genome_identical'].value_counts(dropna=False)

True     78
False    15
Name: closest_genome_identical, dtype: int64

In [39]:
df.loc[merge_ok, 'closest_genome_species_identical'].value_counts(dropna=False)

True     92
False     1
Name: closest_genome_species_identical, dtype: int64

## Save

In [40]:
df.to_csv(processed_out / f'{DATESTR}-1.0b-validation-results-comparison.csv')