# 201122 taxon name matching

In [None]:
# Identifiers used to build this notebook's output directories and file names
exptname = '201031-database-v1.1-software-version-migration'
datestr = '201122'
nbname = ''.join([datestr, '-taxon-name-matching'])

In [None]:
import json
from pathlib import Path
from zipfile import ZipFile
from gzip import GzipFile
import re
from collections import Counter

In [3]:
import numpy as np
import pandas as pd

## File paths

In [4]:
# Input locations, keyed by a short label.
# NOTE(review): 'v11_archive' is an absolute, machine-specific path stored as a plain
# string, while the other entries are relative Path objects.
infiles = {
    'v11_archive': '/home/jared/projects/midas/data/v1/archives/refseq_curated_1.1_beta_200525.midas-archive.gz',
    'taxa': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201102-download-taxa/'),
    'taxonomy_additional': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201109-extract-additional-taxonomy-data/'),
    'taxonomy_original': Path('../../data/intermediate/201031-database-v1.1-software-version-migration/201113-original-genome-taxa/'),
}

In [5]:
# Directory for this notebook's processed outputs; created (with parents) if missing
processed_out = Path('../../data/processed/').joinpath(exptname, nbname)
processed_out.mkdir(parents=True, exist_ok=True)

In [6]:
# Directory for this notebook's intermediate outputs; created (with parents) if missing
intermediate_out = Path('../../data/intermediate/').joinpath(exptname, nbname)
intermediate_out.mkdir(parents=True, exist_ok=True)

## Load data

### Archive files

In [7]:
# The archive is read as a zip file through a gzip decompression layer.
_archive_v11_gz = GzipFile(infiles['v11_archive'])
archive_v11 = ZipFile(_archive_v11_gz)

# Display the decoded contents of the archive's 'info' member
archive_v11.read('info').decode()

'{"archive_version": "1.0"}'

In [8]:
# Parse the curated genome set member of the archive as JSON
gset_data = json.loads(archive_v11.read('genome_sets/midas/assembly/curated'))

In [9]:
# Collect the distinct curated genus names and (genus, species) pairs
# across all genome annotations, as sorted lists.
_annotations = list(gset_data['annotations'].values())

genus_names = sorted({annot['tax_genus'] for annot in _annotations})
species_names = sorted({(annot['tax_genus'], annot['tax_species']) for annot in _annotations})

len(genus_names), len(species_names)

(419, 1438)

### Taxonomy

In [10]:
with open(infiles['taxa'] / 'taxa.json') as f:
    _taxon_records = json.load(f)

# Index the taxon records by their 'taxid' field
taxon_data = {record['taxid']: record for record in _taxon_records}

In [11]:
with open(infiles['taxa'] / 'aka_taxids.json') as f:
    _aka_raw = json.load(f)

# Keys are converted to int; values are kept as loaded
aka_taxids = {int(alias_id): target_id for alias_id, target_id in _aka_raw.items()}

In [12]:
# Check taxon_data dict doesn't include any alias taxonomy IDs
for tid, record in taxon_data.items():
    # Dict key agrees with the record's own ID
    assert record['taxid'] == tid
    # No alias ID is used as a key
    assert tid not in aka_taxids
    # Parent is either 0 or another entry of the dict
    parent_id = record['parent_taxid']
    assert parent_id == 0 or parent_id in taxon_data

In [13]:
with open(infiles['taxonomy_additional'] / 'taxon-othernames.json') as f:
    _othernames_raw = json.load(f)

# Convert the taxon ID keys to int
taxon_othernames = {int(key): names for key, names in _othernames_raw.items()}

In [14]:
with open(infiles['taxonomy_original'] / 'species-genome-lcas.json') as f:
    _lca_records = json.load(f)

# Map (curated genus, curated species) -> taxid of each record
species_genome_lcas = {
    (record['curated_genus'], record['curated_species']): record['taxid']
    for record in _lca_records
}

In [15]:
with open(infiles['taxonomy_original'] / 'genome-matching-taxids-by-species.json') as f:
    _taxid_records = json.load(f)

# Map (curated genus, curated species) -> set of the record's taxids
species_genome_taxids = {
    (record['curated_genus'], record['curated_species']): set(record['taxids'])
    for record in _taxid_records
}

## Func defs

In [16]:
def only(it):
    """Return the single element of an iterable.

    Raises ValueError (from unpacking) if the iterable yields zero or more than one element.
    """
    [sole] = it
    return sole

In [17]:
def resolve_alias(tid):
    """Return the ID that ``aka_taxids`` maps ``tid`` to, or ``tid`` itself if it is not a key there."""
    if tid in aka_taxids:
        return aka_taxids[tid]
    return tid

In [18]:
def gettaxon(tid):
    """Look up a taxon in ``taxon_data`` by ID after resolving alias IDs.

    Raises KeyError if the resolved ID is not in ``taxon_data``.
    """
    resolved_id = resolve_alias(tid)
    return taxon_data[resolved_id]

In [19]:
def getparent(taxon):
    """Return the parent of a taxon, or None if the parent lookup raises KeyError.

    ``taxon`` may be a taxon dict or an int ID, which is looked up first.
    The parent ID is resolved through ``gettaxon``, so alias parent IDs are handled.
    """
    node = gettaxon(taxon) if isinstance(taxon, int) else taxon

    try:
        parent = gettaxon(node['parent_taxid'])
    except KeyError:
        parent = None
    return parent

In [20]:
def iter_ancestors(taxon, incself=False):
    """Yield the ancestors of a taxon, walking parent links until ``getparent`` returns None.

    ``taxon`` may be a taxon dict or an int ID. If ``incself`` is true the
    taxon itself is yielded first.
    """
    current = gettaxon(taxon) if isinstance(taxon, int) else taxon
    if not incself:
        current = getparent(current)

    while current is not None:
        yield current
        current = getparent(current)

In [21]:
def normalize_name(name):
    """Normalize a taxon name for comparison.

    Non-string input (e.g. a (genus, species) tuple) is joined with spaces.
    The result is stripped, lower-cased, has square brackets removed, and
    has a leading 'candidatus ' prefix dropped.
    """
    joined = name if isinstance(name, str) else ' '.join(name)
    cleaned = joined.strip().lower()

    # Remove square brackets
    cleaned = re.sub(r'[\[\]]', '', cleaned)

    prefix = 'candidatus '
    if cleaned.startswith(prefix):
        cleaned = cleaned[len(prefix):]

    return cleaned

def name_eq(name1, name2):
    """Return True if the two names are equal after ``normalize_name``."""
    left = normalize_name(name1)
    right = normalize_name(name2)
    return left == right

In [22]:
def iter_names(taxon):
    """Yield (name, type) pairs for a taxon.

    The scientific name comes first with type 'primary', followed by any
    entries recorded for the taxon's ID in ``taxon_othernames``.
    """
    yield taxon['scientific_name'], 'primary'
    other_entries = taxon_othernames.get(taxon['taxid'], [])
    yield from ((entry['name'], entry['type']) for entry in other_entries)

In [23]:
def taxon_url(taxid):
    """Return the NCBI Taxonomy Browser URL for an integer taxon ID."""
    template = 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=%d'
    return template % taxid

## Non-alphabetic genus and species names

Curated genus/species names in the previous database version are expected to contain only alphabetic characters. Check for instances where this does not hold:

In [24]:
# Genus names that are not one capital letter followed by one or more lowercase letters
_genus_format = re.compile(r'[A-Z][a-z]+')
weird_genus_names = [g for g in genus_names if not _genus_format.fullmatch(g)]

for genus in weird_genus_names:
    print(genus)

(No genus names with unexpected format)

In [25]:
# (genus, species) pairs whose species part is not purely alphabetic (case-insensitive)
_species_format = re.compile(r'[a-z]+', re.I)
weird_species_names = [
    (g, s) for g, s in species_names
    if _species_format.fullmatch(s) is None
]

for genus, species in weird_species_names:
    print('%25s\t%s' % (genus, species))

            Aphanizomenon	flos-aquae
                  Blautia	[Ruminococcus] gnavus
             Caballeronia	Paraburkholderia zhejiangensis
   Erysipelatoclostridium	[Clostridium] innocuum
              Haemophilus	[Haemophilus] parasuis
               Klebsiella	[Enterobacter] aerogenes
        Lachnoclostridium	[Clostridium] bolteae
        Lachnoclostridium	[Clostridium] citroniae
        Lachnoclostridium	[Clostridium] clostridioforme
        Lachnoclostridium	[Clostridium] symbiosum
              Pasteurella	[Pasteurella] pneumotropica
              Pseudomonas	syringae group genomosp. 7
        Ruminiclostridium	[Clostridium] clariflavum
        Ruminiclostridium	[Clostridium] josui
        Ruminiclostridium	[Clostridium] stercorarium
        Ruminiclostridium	[Eubacterium] siraeum
                Wolbachia	endosymbiont of Culex quinquefasciatus
                Wolbachia	endosymbiont of Drosophila melanogaster


These species names may cause issues; inspect the results for each of them individually later.

## Find ancestor of species genome LCAs matching species name

The LCA of the genomes for a specific species should be within the correct taxon in most cases, but expect that it may be more specific than desired (e.g. a subspecies or strain). Iterate through the ancestors of each LCA and find all matches between ancestor taxon names (including alias names) and the species name.

In [26]:
# For each curated species, collect every (taxid, name, name type, species_only) match
# between the names of the LCA taxon's lineage (LCA itself included) and the curated name.
species_name_matches = {}

for db_species, lca_id in species_genome_lcas.items():
    lineage = list(iter_ancestors(gettaxon(lca_id), incself=True))

    # Try the (genus, species) pair first, then the species field on its own
    queries = [(db_species, False), (db_species[1], True)]

    species_name_matches[db_species] = {
        (ancestor['taxid'], ancestor_name, name_type, species_only)
        for query, species_only in queries
        for ancestor in lineage
        for ancestor_name, name_type in iter_names(ancestor)
        if name_eq(query, ancestor_name)
    }

In [27]:
# Tally how many species have 0, 1, 2, ... matches
Counter(len(found) for found in species_name_matches.values())

Counter({1: 1387, 0: 19, 2: 31, 3: 1})

Nearly all species have a single unique match, a few have no match or more than one.

### Check species with more than one match

In [28]:
# List every species that has more than one match, with its matches indented beneath it
for sp, matches in species_name_matches.items():
    if len(matches) <= 1:
        continue
    print(repr(sp))
    for match in matches:
        print('\t' + repr(match))

('Blautia', '[Ruminococcus] gnavus')
	(33038, 'Ruminococcus gnavus', 'Synonym', True)
	(33038, '[Ruminococcus] gnavus', 'primary', True)
('Erysipelatoclostridium', '[Clostridium] innocuum')
	(1522, 'Clostridium innocuum', 'Synonym', True)
	(1522, '[Clostridium] innocuum', 'primary', True)
('Filomicrobium', 'marinum')
	(1608628, 'Candidatus Filomicrobium marinum', 'primary', False)
	(1608628, 'Filomicrobium marinum', 'EquivalentName', False)
('Hamiltonella', 'defensa')
	(138072, 'Candidatus Hamiltonella defensa', 'primary', False)
	(138072, 'Hamiltonella defensa', 'Synonym', False)
('Helicobacter', 'heilmannii')
	(35817, 'Candidatus Helicobacter heilmannii', 'Synonym', False)
	(35817, 'Helicobacter heilmannii', 'primary', False)
('Helicobacter', 'suis')
	(104628, 'Helicobacter suis', 'primary', False)
	(104628, 'Candidatus Helicobacter suis', 'Synonym', False)
('Kinetoplastibacterium', 'blastocrithidii')
	(233181, 'Kinetoplastibacterium blastocrithidii', 'EquivalentName', False)
	(23318

All species with multiple matches have only one matched taxon, so there are no conflicts.

### Check species with no matches

In [29]:
# For every species without a match, show the name and rank of its genome LCA taxon
for sp, matches in species_name_matches.items():
    if matches:
        continue

    taxid = species_genome_lcas[sp]
    taxon = gettaxon(taxid)

    print(repr(sp))
    for field in ('scientific_name', 'rank'):
        print('\t' + taxon[field])

('Actinomyces', 'odontolyticus')
	Schaalia odontolytica
	species
('Azospirillum', 'brasilense')
	Azospirillum
	genus
('Francisella', 'noatunensis')
	Francisella orientalis
	species
('Lachnoclostridium', '[Clostridium] clostridioforme')
	Enterocloster clostridioformis
	species
('Mobiluncus', 'curtisii')
	Mobiluncus
	genus
('Mycobacterium', 'intracellulare')
	Mycobacterium avium complex (MAC)
	species group
('Pectobacterium', 'carotovorum')
	Pectobacterium
	genus
('Pectobacterium', 'wasabiae')
	Pectobacterium
	genus
('Photorhabdus', 'luminescens')
	Photorhabdus
	genus
('Photorhabdus', 'temperata')
	Photorhabdus
	genus
('Pseudoalteromonas', 'haloplanktis')
	Pseudoalteromonas
	genus
('Pseudomonas', 'pseudoalcaligenes')
	Pseudomonas oleovorans
	species
('Salinispora', 'pacifica')
	Salinispora
	genus
('Vibrio', 'alginolyticus')
	Vibrio harveyi group
	species group
('Vibrio', 'tasmaniensis')
	Vibrio
	genus
('Xanthomonas', 'alfalfae')
	Xanthomonas euvesicatoria
	species
('Xanthomonas', 'axonop

These probably require manual curation, which will be tackled in another notebook.

### Pick at most one match for each species

We've established that when a species has multiple matches they all point to the same taxon, so this step only selects which matched name to report.

In [30]:
def match_priority(match):
    """Return a priority tuple for a (taxid, name, name_type, species_only) match.

    The tuple is (matched the taxon's primary name, matched on the full
    genus+species name); larger tuples indicate preferred matches.
    """
    _taxid, _name, name_type, species_only = match
    is_primary = name_type == 'primary'
    is_full_name = not species_only
    return (is_primary, is_full_name)

In [31]:
# Reduce each species' set of matches to a single best match (or None if there is none).
species_single_name_matches = dict()


def _match_rank(match):
    """Sort key that places the preferred match first.

    Orders by descending ``match_priority``, then by ascending name length
    (the tiebreaker), then by the match tuple itself so the choice does not
    depend on set iteration order.
    """
    inverted_priority = tuple(not flag for flag in match_priority(match))
    return (inverted_priority, len(match[1]), match)


for sp, matches in species_name_matches.items():
    if len(matches) == 0:
        species_single_name_matches[sp] = None
        continue

    if len(matches) == 1:
        species_single_name_matches[sp] = only(matches)
        continue

    # Assert all same taxon
    assert len(set(taxid for taxid, *rest in matches)) == 1

    # A single sort handles both priority and tiebreak. Previously the tiebreak
    # re-sorted in ascending priority order, which could move a lower-priority
    # match to the front when there were three or more matches.
    ranked = sorted(matches, key=_match_rank)
    best = ranked[0]

    # Print warning if more than one match shares the top priority
    tied = [m for m in ranked if match_priority(m) == match_priority(best)]
    if len(tied) > 1:
        print('Ambiguous primary match for species %r:' % (sp,))
        for m in tied:
            print('\t', m, sep='')

    species_single_name_matches[sp] = best

Ambiguous primary match for species ('Rickettsia', 'amblyommii'):
	(33989, 'Candidatus Rickettsia amblyommii', 'Synonym', False)
	(33989, 'Rickettsia amblyommii', 'Synonym', False)


### Mapped taxon collisions

Instances where multiple curated species names were mapped to the same taxon:

In [32]:
# Group curated species by the taxon ID of their selected match
_species_by_matched_taxid = {}

for sp, match in species_single_name_matches.items():
    if match is not None:
        _species_by_matched_taxid.setdefault(match[0], set()).add(sp)

# Keep only taxon IDs shared by more than one curated species
species_match_collisions = {
    matched_taxid: species_set
    for matched_taxid, species_set in _species_by_matched_taxid.items()
    if len(species_set) != 1
}

len(species_match_collisions)

3

## Summarize in table

In [33]:
# Number of genome annotations per (genus, species) pair
species_genome_counts = Counter(
    (annot['tax_genus'], annot['tax_species'])
    for annot in gset_data['annotations'].values()
)

In [34]:
# Build one summary row per curated species. Rows are flat tuples whose positions
# must line up with the `columns` list passed to DataFrame.from_records below.
_rows = []

for sp in species_names:
    lca_taxid = species_genome_lcas[sp]
    lca_taxon = gettaxon(lca_taxid)
        
    # Genus, species, standard-format flag, genome count, number of distinct original taxids
    row = (*sp, sp not in weird_species_names, species_genome_counts[sp], len(species_genome_taxids[sp]))
    # ID, name and rank of the genome LCA taxon
    row += tuple(lca_taxon[k] for k in ['taxid', 'scientific_name', 'rank'])
    
    match = species_single_name_matches[sp]
    if match is None:
        # No match: fill all seven matched_* columns with None
        row += (None,) * 7
    else:
        match_taxid, match_name, match_nametype, match_speciesonly = match
        match_taxon = gettaxon(match_taxid)
        
        row += (match_name, match_nametype, match_speciesonly)
        row += tuple(match_taxon[k] for k in ['taxid', 'scientific_name', 'rank'])
        # True when no other curated species was mapped to the same taxon
        row += (match_taxid not in species_match_collisions,)
    
    _rows.append(row)
    
# Assemble the table, indexed by (curated_genus, curated_species)
df = pd.DataFrame.from_records(
    _rows,
    columns=['curated_genus', 'curated_species', 'speciesname_standard_format', 'species_ngenomes', 'original_taxon_count',
             'lca_taxid', 'lca_taxon_name', 'lca_taxon_rank',
             'matched_name', 'matched_nametype', 'matched_speciesname_only',
             'matched_taxid', 'matched_taxon_name', 'matched_taxon_rank',
             'matched_taxon_unique'],
).set_index(['curated_genus', 'curated_species'])

# Pandas coerces mix of int and None to floats, reverse it
# (the column is rebuilt as an object array holding Python ints and None)
df['matched_taxid'] = np.asarray([None if pd.isnull(x) else int(x) for x in df['matched_taxid']], dtype=object)

## Inspect results

### Matched taxon not of species rank

In [35]:
# Rows that have a match whose taxon rank is not 'species'
_has_match = ~df['matched_taxid'].isnull()
_not_species_rank = df['matched_taxon_rank'] != 'species'
df[_has_match & _not_species_rank]

Unnamed: 0_level_0,Unnamed: 1_level_0,speciesname_standard_format,species_ngenomes,original_taxon_count,lca_taxid,lca_taxon_name,lca_taxon_rank,matched_name,matched_nametype,matched_speciesname_only,matched_taxid,matched_taxon_name,matched_taxon_rank,matched_taxon_unique
curated_genus,curated_species,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Bifidobacterium,kashiwanohense,True,3,1,1150460,Bifidobacterium catenulatum subsp. kashiwanohe...,strain,Bifidobacterium kashiwanohense,Synonym,False,630129,Bifidobacterium catenulatum subsp. kashiwanohense,subspecies,True
Enterobacter,xiangfangensis,True,10,1,1296536,Enterobacter hormaechei subsp. xiangfangensis,subspecies,Enterobacter xiangfangensis,Synonym,False,1296536,Enterobacter hormaechei subsp. xiangfangensis,subspecies,True
Mycobacterium,africanum,True,22,19,33894,Mycobacterium tuberculosis variant africanum,biotype,Mycobacterium africanum,GenbankSynonym,False,33894,Mycobacterium tuberculosis variant africanum,biotype,True
Mycobacterium,bovis,True,66,19,1765,Mycobacterium tuberculosis variant bovis,biotype,Mycobacterium bovis,GenbankSynonym,False,1765,Mycobacterium tuberculosis variant bovis,biotype,True
Xanthomonas,gardneri,True,11,2,2754056,Xanthomonas hortorum pv. gardneri,no rank,Xanthomonas gardneri,Synonym,False,2754056,Xanthomonas hortorum pv. gardneri,no rank,True


In [36]:
# Row count of the filter shown above, computed explicitly instead of via IPython's
# `_` output cache, so the cell does not depend on which cell produced the last output.
df[~df['matched_taxid'].isnull() & (df['matched_taxon_rank'] != 'species')].shape[0]

5

These seem like valid matches. The curated species names corresponding to them will probably have to be merged into the parents of these taxa when finishing the database migration.

### Matched on species name only

In [37]:
# Rows whose match was made on the species name field alone
_species_only_match = df['matched_speciesname_only'] == True
df[_species_only_match]

Unnamed: 0_level_0,Unnamed: 1_level_0,speciesname_standard_format,species_ngenomes,original_taxon_count,lca_taxid,lca_taxon_name,lca_taxon_rank,matched_name,matched_nametype,matched_speciesname_only,matched_taxid,matched_taxon_name,matched_taxon_rank,matched_taxon_unique
curated_genus,curated_species,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Blautia,[Ruminococcus] gnavus,False,3,3,33038,[Ruminococcus] gnavus,species,[Ruminococcus] gnavus,primary,True,33038,[Ruminococcus] gnavus,species,True
Caballeronia,Paraburkholderia zhejiangensis,False,2,1,871203,Caballeronia zhejiangensis,species,Paraburkholderia zhejiangensis,Synonym,True,871203,Caballeronia zhejiangensis,species,True
Erysipelatoclostridium,[Clostridium] innocuum,False,2,2,1522,[Clostridium] innocuum,species,[Clostridium] innocuum,primary,True,1522,[Clostridium] innocuum,species,True
Haemophilus,[Haemophilus] parasuis,False,24,19,738,Glaesserella parasuis,species,Haemophilus parasuis,Synonym,True,738,Glaesserella parasuis,species,True
Klebsiella,[Enterobacter] aerogenes,False,108,16,548,Klebsiella aerogenes,species,Enterobacter aerogenes,Synonym,True,548,Klebsiella aerogenes,species,True
Lachnoclostridium,[Clostridium] bolteae,False,7,7,208479,Enterocloster bolteae,species,Clostridium bolteae,Synonym,True,208479,Enterocloster bolteae,species,True
Lachnoclostridium,[Clostridium] citroniae,False,2,2,358743,Enterocloster citroniae,species,Clostridium citroniae,Synonym,True,358743,Enterocloster citroniae,species,True
Lachnoclostridium,[Clostridium] symbiosum,False,3,3,1512,[Clostridium] symbiosum,species,[Clostridium] symbiosum,primary,True,1512,[Clostridium] symbiosum,species,True
Pasteurella,[Pasteurella] pneumotropica,False,2,2,758,Rodentibacter pneumotropicus,species,Pasteurella pneumotropica,Synonym,True,758,Rodentibacter pneumotropicus,species,True
Ruminiclostridium,[Clostridium] clariflavum,False,2,2,288965,Hungateiclostridium clariflavum,species,Clostridium clariflavum,Synonym,True,288965,Hungateiclostridium clariflavum,species,True


In [38]:
# Row count of the filter shown above, computed explicitly instead of via IPython's
# `_` output cache, so the cell does not depend on which cell produced the last output.
df[df['matched_speciesname_only'] == True].shape[0]

13

### Matched taxon is not LCA taxon

In [39]:
# Rows that have a match whose taxon differs from the genome LCA taxon
_has_match = ~df['matched_taxid'].isnull()
_differs_from_lca = df['lca_taxid'] != df['matched_taxid']
df[_has_match & _differs_from_lca]

Unnamed: 0_level_0,Unnamed: 1_level_0,speciesname_standard_format,species_ngenomes,original_taxon_count,lca_taxid,lca_taxon_name,lca_taxon_rank,matched_name,matched_nametype,matched_speciesname_only,matched_taxid,matched_taxon_name,matched_taxon_rank,matched_taxon_unique
curated_genus,curated_species,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Acidiphilium,multivorum,True,2,1,926570,Acidiphilium multivorum AIU301,strain,Acidiphilium multivorum,primary,False,62140,Acidiphilium multivorum,species,True
Aeromonas,diversa,True,2,1,1268237,Aeromonas diversa CDC 2478-85,strain,Aeromonas diversa,primary,False,502790,Aeromonas diversa,species,True
Amycolatopsis,methanolica,True,2,1,1068978,Amycolatopsis methanolica 239,strain,Amycolatopsis methanolica,primary,False,1814,Amycolatopsis methanolica,species,True
Arenimonas,composti,True,2,1,1121013,Arenimonas composti TR7-09 = DSM 18010,strain,Arenimonas composti,primary,False,370776,Arenimonas composti,species,True
Arenimonas,oryziterrae,True,2,1,1121015,Arenimonas oryziterrae DSM 21050 = YC6267,strain,Arenimonas oryziterrae,primary,False,498055,Arenimonas oryziterrae,species,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vibrio,natriegens,True,3,1,1219067,Vibrio natriegens NBRC 15636 = ATCC 14048 = DS...,strain,Vibrio natriegens,primary,False,691,Vibrio natriegens,species,True
Vibrio,orientalis,True,2,1,675816,Vibrio orientalis CIP 102891 = ATCC 33934,strain,Vibrio orientalis,primary,False,28175,Vibrio orientalis,species,True
Waddlia,chondrophila,True,2,1,716544,Waddlia chondrophila WSU 86-1044,strain,Waddlia chondrophila,primary,False,71667,Waddlia chondrophila,species,True
Wenxinia,marina,True,2,1,1123501,Wenxinia marina DSM 24838,strain,Wenxinia marina,primary,False,390641,Wenxinia marina,species,True


In [40]:
# Row count of the filter shown above, computed explicitly instead of via IPython's
# `_` output cache, so the cell does not depend on which cell produced the last output.
df[~df['matched_taxid'].isnull() & (df['lca_taxid'] != df['matched_taxid'])].shape[0]

213

### Species names with non-standard format

In [41]:
# Rows whose curated species name is not in the standard (purely alphabetic) format
_nonstandard_format = ~df['speciesname_standard_format']
df[_nonstandard_format]

Unnamed: 0_level_0,Unnamed: 1_level_0,speciesname_standard_format,species_ngenomes,original_taxon_count,lca_taxid,lca_taxon_name,lca_taxon_rank,matched_name,matched_nametype,matched_speciesname_only,matched_taxid,matched_taxon_name,matched_taxon_rank,matched_taxon_unique
curated_genus,curated_species,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Aphanizomenon,flos-aquae,False,2,2,1176,Aphanizomenon flos-aquae,species,Aphanizomenon flos-aquae,primary,False,1176.0,Aphanizomenon flos-aquae,species,True
Blautia,[Ruminococcus] gnavus,False,3,3,33038,[Ruminococcus] gnavus,species,[Ruminococcus] gnavus,primary,True,33038.0,[Ruminococcus] gnavus,species,True
Caballeronia,Paraburkholderia zhejiangensis,False,2,1,871203,Caballeronia zhejiangensis,species,Paraburkholderia zhejiangensis,Synonym,True,871203.0,Caballeronia zhejiangensis,species,True
Erysipelatoclostridium,[Clostridium] innocuum,False,2,2,1522,[Clostridium] innocuum,species,[Clostridium] innocuum,primary,True,1522.0,[Clostridium] innocuum,species,True
Haemophilus,[Haemophilus] parasuis,False,24,19,738,Glaesserella parasuis,species,Haemophilus parasuis,Synonym,True,738.0,Glaesserella parasuis,species,True
Klebsiella,[Enterobacter] aerogenes,False,108,16,548,Klebsiella aerogenes,species,Enterobacter aerogenes,Synonym,True,548.0,Klebsiella aerogenes,species,True
Lachnoclostridium,[Clostridium] bolteae,False,7,7,208479,Enterocloster bolteae,species,Clostridium bolteae,Synonym,True,208479.0,Enterocloster bolteae,species,True
Lachnoclostridium,[Clostridium] citroniae,False,2,2,358743,Enterocloster citroniae,species,Clostridium citroniae,Synonym,True,358743.0,Enterocloster citroniae,species,True
Lachnoclostridium,[Clostridium] clostridioforme,False,9,9,1531,Enterocloster clostridioformis,species,,,,,,,
Lachnoclostridium,[Clostridium] symbiosum,False,3,3,1512,[Clostridium] symbiosum,species,[Clostridium] symbiosum,primary,True,1512.0,[Clostridium] symbiosum,species,True


In [42]:
# Row count of the filter shown above, computed explicitly instead of via IPython's
# `_` output cache, so the cell does not depend on which cell produced the last output.
df[~df['speciesname_standard_format']].shape[0]

18

### Non-unique matches

In [43]:
# Rows whose matched taxon is shared with another curated species
_shared_taxon = df['matched_taxon_unique'] == False
df[_shared_taxon]

Unnamed: 0_level_0,Unnamed: 1_level_0,speciesname_standard_format,species_ngenomes,original_taxon_count,lca_taxid,lca_taxon_name,lca_taxon_rank,matched_name,matched_nametype,matched_speciesname_only,matched_taxid,matched_taxon_name,matched_taxon_rank,matched_taxon_unique
curated_genus,curated_species,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Bacillus,mycoides,True,9,4,1405,Bacillus mycoides,species,Bacillus mycoides,primary,False,1405,Bacillus mycoides,species,False
Bacillus,weihenstephanensis,True,3,2,1405,Bacillus mycoides,species,Bacillus weihenstephanensis,GenbankSynonym,False,1405,Bacillus mycoides,species,False
Streptomyces,californicus,True,3,1,67351,Streptomyces californicus,species,Streptomyces californicus,primary,False,67351,Streptomyces californicus,species,False
Streptomyces,puniceus,True,2,1,67351,Streptomyces californicus,species,Streptomyces puniceus,Synonym,False,67351,Streptomyces californicus,species,False
Vibrio,albensis,True,3,3,666,Vibrio cholerae,species,Vibrio albensis,Synonym,False,666,Vibrio cholerae,species,False
Vibrio,cholerae,True,437,191,666,Vibrio cholerae,species,Vibrio cholerae,primary,False,666,Vibrio cholerae,species,False


These look like instances where one species was merged into another.

## Write output

### Summary table to CSV and matches to JSON

In [44]:
# Write the summary table to the processed output directory
_summary_csv = processed_out / (datestr + '-db-v1.1-taxon-name-matching-summary.csv')
df.to_csv(_summary_csv)

In [45]:
# Serialize the selected match for each species as a list of flat records;
# species without a match get None for every match field.
_match_fields = ('matched_taxid', 'matched_name', 'nametype', 'species_name_only')
_matches_json = []

for (g, s), match in species_single_name_matches.items():
    values = (None,) * len(_match_fields) if match is None else match

    entry = dict(curated_genus=g, curated_species=s)
    for field, value in zip(_match_fields, values):
        entry[field] = value

    _matches_json.append(entry)

_matches_path = intermediate_out / 'species-name-matches.json'
with open(_matches_path, 'w') as f:
    json.dump(_matches_json, f)