In [1]:
from pathlib import Path
from gzip import GzipFile
import re

In [2]:
from tqdm import tqdm

In [3]:
from gambit_legacy.midas1.archive import Midas1Archive

## Setup

In [4]:
# Identifiers of the MIDAS v1 archives to compare, ordered by the date suffix
# in their names. Later cells refer to the loaded data by position in this list.
keys = [
    'refseq_1.0_160823',
    'refseq_1.0_160906',
    'refseq_1.1_beta_200521',
]

# Directory holding the archives. This is the only machine-specific value in
# the notebook; edit it here when running somewhere else.
ARCHIVE_DIR = Path('/home/jared/projects/gambit/data/databases/midas/v1/archives')

# One gzip-compressed archive file per key, in the same order as `keys`.
archive_files = [ARCHIVE_DIR / (key + '.midas-archive.gz') for key in keys]

## Load data

In [5]:
# Load the genome set stored in each archive.
#   gsets[i]        -> genome-set metadata read from archive_files[i]
#   gset_genomes[i] -> the genomes belonging to that set
gsets = []
gset_genomes = []

for path in tqdm(archive_files):
    # Use a context manager so the gzip stream is closed once the archive has
    # been read, instead of leaking one open file handle per archive.
    # NOTE(review): assumes get_genome_set() returns fully materialized
    # objects that do not read from the stream afterwards - confirm.
    with GzipFile(path) as gz:
        archive = Midas1Archive(gz)

        # Each archive is expected to contain exactly one genome set; the
        # one-element unpacking raises ValueError otherwise.
        (gset_key,) = archive.list_genome_sets()

        gset, genomes = archive.get_genome_set(gset_key)

    gsets.append(gset)
    gset_genomes.append(genomes)

100%|██████████| 3/3 [00:24<00:00,  8.14s/it]


In [6]:
# Display the genome-set metadata loaded from each archive, in `keys` order.
gsets

[{'description': 'All bacterial RefSeq entries in assembly database',
  'key_version': '1.0',
  'key': 'refseq/assembly/all',
  'name': 'refseq',
  'meta': None},
 {'meta': None,
  'name': 'refseq',
  'key': 'refseq/assembly/all',
  'key_version': '1.0',
  'description': 'All bacterial RefSeq entries in assembly database'},
 {'key_version': '1.1',
  'description': 'All bacterial RefSeq entries in assembly database, updated with newest ESummary data 2020-05-20.',
  'meta': {'parent': {'key_version': '1.0', 'key': 'refseq/assembly/all'},
   'date_created': '2020-05-20'},
  'key': 'refseq/assembly/all',
  'name': 'refseq 1.1'}]

In [7]:
# Number of genomes loaded from each archive, labelled by archive key.
dict(zip(keys, map(len, gset_genomes)))

{'refseq_1.0_160823': 65563,
 'refseq_1.0_160906': 65561,
 'refseq_1.1_beta_200521': 65556}

## Compare

## 160823 $\to$ 160906

In [8]:
# Check whether the first two archives carry identical genome-set metadata.
gsets[0] == gsets[1]

True

In [9]:
# Every key in the second archive's genomes must also exist in the first
# archive, i.e. nothing was added between the two.
assert set(gset_genomes[1]).issubset(set(gset_genomes[0]))

In [10]:
# Keys present in the first archive but absent from the second one.
missing_160906 = set(gset_genomes[0]).difference(set(gset_genomes[1]))
missing_160906

{'refseq/assembly/GCF_900021165.1', 'refseq/assembly/GCF_900069005.1'}

In [11]:
# Every genome kept in the second archive must have exactly the same value as
# in the first archive. The offending key is attached to the assertion so a
# mismatch among the tens of thousands of entries can be located directly.
for k, g in gset_genomes[1].items():
    assert g == gset_genomes[0][k], 'genome %r differs between archives' % (k,)

In [12]:
# Show the first archive's entry for each genome that is missing from the second.
dict((key, gset_genomes[0][key]) for key in missing_160906)

{'refseq/assembly/GCF_900069005.1': {'tax_strain': 'GM2',
  'tax_species': 'sp. GM2',
  'organism': 'Paenibacillus sp. GM2 (firmicutes)',
  'tax_genus': 'Paenibacillus'},
 'refseq/assembly/GCF_900021165.1': {'tax_strain': 'MT24',
  'tax_species': 'sp. MT24',
  'organism': 'Paenibacillus sp. MT24 (firmicutes)',
  'tax_genus': 'Paenibacillus'}}

## 160906 $\to$ 200521

In [13]:
# Every key in the third archive's genomes must also exist in the second
# archive, i.e. nothing was added between the two.
assert set(gset_genomes[2]).issubset(set(gset_genomes[1]))

In [14]:
# Keys present in the second archive but absent from the third one.
missing_200521 = set(gset_genomes[1]).difference(set(gset_genomes[2]))
missing_200521

{'refseq/assembly/GCF_000026325.1',
 'refseq/assembly/GCF_000220025.2',
 'refseq/assembly/GCF_000367945.1',
 'refseq/assembly/GCF_000487935.1',
 'refseq/assembly/GCF_000542635.1'}

In [15]:
# Show the second archive's entry for each genome that is missing from the third.
dict((key, gset_genomes[1][key]) for key in missing_200521)

{'refseq/assembly/GCF_000220025.2': {'tax_strain': 'AES-1R',
  'tax_genus': 'Pseudomonas',
  'organism': 'Pseudomonas aeruginosa AES-1R (g-proteobacteria)',
  'tax_species': 'aeruginosa'},
 'refseq/assembly/GCF_000367945.1': {'tax_strain': 'NIPH 809',
  'tax_genus': 'Acinetobacter',
  'organism': 'Acinetobacter sp. NIPH 809 (g-proteobacteria)',
  'tax_species': 'sp. NIPH 809'},
 'refseq/assembly/GCF_000487935.1': {'tax_strain': 'CFSAN000624',
  'tax_genus': 'Salmonella',
  'organism': 'Salmonella enterica subsp. enterica serovar Stanleyville str. CFSAN000624 (enterobacteria)',
  'tax_species': 'enterica'},
 'refseq/assembly/GCF_000026325.1': {'tax_strain': 'UMN026',
  'tax_genus': 'Escherichia',
  'organism': 'Escherichia coli UMN026 (E. coli)',
  'tax_species': 'coli'},
 'refseq/assembly/GCF_000542635.1': {'tax_strain': 'GGMC6027',
  'tax_genus': 'Staphylococcus',
  'organism': 'Staphylococcus aureus GGMC6027 (firmicutes)',
  'tax_species': 'aureus'}}

## Compile