# 201031 migrate genomes

In [1]:
import json
from pathlib import Path
from zipfile import ZipFile
from gzip import GzipFile

from tqdm import tqdm
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [2]:
from midas.db.models import Genome, ReferenceGenomeSet, Taxon, AnnotatedGenome
from midas.db.migrate import init_db

## File paths

In [3]:
# Input files for this migration, keyed by a short label.
infiles = {
    'v1_archive': '/home/jared/projects/midas/data/v1/archives/refseq_curated_1.1_beta_200525.midas-archive.gz',
}

In [4]:
# SQLite database file this notebook writes, relative to the working directory.
dbfile = Path('./db.sqlite')

In [5]:
# Directory for files written by this notebook; created if it does not exist yet.
tmpdir = Path('tmp')
tmpdir.mkdir(exist_ok=True)

## Open v1.1 data

In [6]:
# The archive is opened as a zip file read through a gzip stream.
gzip_stream = GzipFile(infiles['v1_archive'])
archive_zip = ZipFile(gzip_stream)

# Display the archive's 'info' member as text.
archive_zip.read('info').decode()

'{"archive_version": "1.0"}'

## Initialize database

In [7]:
# Delete any existing database file so the migration starts from scratch.
if dbfile.is_file():
    dbfile.unlink()

In [8]:
# SQLAlchemy URL pointing at the SQLite file.
db_url = f'sqlite:///{dbfile}'
engine = create_engine(db_url)

# Factory for sessions bound to that engine.
Session = sessionmaker(bind=engine)

In [9]:
# Run the project's database initialization on the new engine
# (presumably creates the schema -- confirm in midas.db.migrate.init_db).
init_db(engine)

In [10]:
# Session used for the database work in this notebook.
session = Session()

## Convert genomes

In [11]:
# Archive members under the 'genomes/' prefix.
genomefiles = [
    member
    for member in archive_zip.namelist()
    if member.startswith('genomes/')
]

In [12]:
# Distinct 'gb_taxid' values seen across the genome records.
genome_taxids = set()

In [13]:
# Convert each v1 genome record in the archive into a Genome object and add it
# to the session. Nothing is committed here; objects are only staged.
#
# Records are validated with explicit exceptions rather than ``assert``:
# assertions are stripped under ``python -O``, and these messages identify the
# offending archive member and value.
for fname in tqdm(genomefiles):
    with archive_zip.open(fname) as f:
        gdata = json.load(f)

    summary = gdata['gb_summary']

    # Accession #s
    refseq_acc = gdata['gb_acc']
    if not refseq_acc.startswith('GCF_'):
        raise ValueError(
            f'{fname}: RefSeq accession {refseq_acc!r} does not start with "GCF_"'
        )

    synonym_refseq = summary['synonym']['refseq']
    if synonym_refseq != refseq_acc:
        raise ValueError(
            f'{fname}: summary RefSeq synonym {synonym_refseq!r} '
            f'does not match gb_acc {refseq_acc!r}'
        )

    genbank_acc = summary['synonym']['genbank']
    if not genbank_acc.startswith('GCA_'):
        raise ValueError(
            f'{fname}: GenBank accession {genbank_acc!r} does not start with "GCA_"'
        )

    taxid = gdata['gb_taxid']

    # Values passed through the Genome ``extra`` argument rather than as
    # dedicated keyword arguments.
    extra = dict(
        ncbi_taxid=taxid,
        sequence_source=gdata['meta']['sequence_source'],
    )

    genome = Genome(
        key=gdata['key'],
        version=gdata['key_version'],
        description=gdata['description'],
        entrez_db=gdata['gb_db'],
        entrez_id=gdata['gb_id'],
        genbank_acc=genbank_acc,
        refseq_acc=refseq_acc,
        extra=extra,
    )

    session.add(genome)

    # Remember the taxonomy ID so the full set can be exported afterwards.
    genome_taxids.add(taxid)

100%|██████████| 50752/50752 [00:10<00:00, 4867.46it/s]


In [14]:
# Commit everything added to the session so far.
session.commit()

## Get taxonomy IDs for download

In [15]:
# Write the collected taxonomy IDs, sorted, as a JSON array.
taxids_path = tmpdir / 'genome_taxids.json'
with taxids_path.open('w') as out:
    json.dump(sorted(genome_taxids), out)