# 210717 Find sequence URLs

In [None]:
from pathlib import Path
import json

In [2]:
import pandas as pd
from tqdm import tqdm
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

In [3]:
from midas.database.basicdatabase import GenomeSet, Genome

## Setup

In [4]:
# Date stamp of this notebook, and the notebook name derived from it
# (used below to name the intermediate-data directory).
DATESTR = '210717'
NBNAME = f'{DATESTR}-find-sequence-urls'

In [5]:
# Input files read by this notebook, keyed by a short name:
#   'db'      - SQLite database file opened with SQLAlchemy below
#   'genomes' - CSV table of genomes (read with pandas below)
infiles = {
    'db': Path('/home/jared/projects/gambit/data/databases/refseq-curated/midas-1.1/refseq-curated-1.1b-200525.db'),
    'genomes': Path('../../data/intermediate/210303-database-v2-overlaps/210303-format-data/genomes-v1.1.csv'),
}

In [6]:
# Directory that receives this notebook's intermediate output files.
data_intermediate = Path('data-intermediate') / NBNAME
# parents=True also creates the top-level 'data-intermediate' directory when it
# is missing (e.g. on a fresh checkout); without it mkdir() raises
# FileNotFoundError. exist_ok=True keeps re-running the cell harmless.
data_intermediate.mkdir(parents=True, exist_ok=True)

## Load data

### Genomes table

In [7]:
# Load the genomes table from the CSV file listed in `infiles`.
genomes_df = pd.read_csv(infiles['genomes'])

In [8]:
# The 'accession' column of the genomes table; its length and values drive the
# consistency check and the output ordering below.
accs = genomes_df['accession']

### Database

In [9]:
# SQLAlchemy engine for the SQLite database file listed in `infiles`.
engine = create_engine(f"sqlite:///{infiles['db']}")

In [10]:
# Session factory bound to the database engine, plus the single session
# used for all queries in this notebook.
Session = sessionmaker(bind=engine)
session = Session()

In [11]:
# Fetch the database's GenomeSet row. Query.one() raises unless the query
# returns exactly one row, so this also checks that there is a single set.
gset = session.query(GenomeSet).one()

In [12]:
# Sanity check: the database must hold exactly as many Genome rows as the
# genomes table has accessions.
assert session.query(Genome).count() == len(accs)

### Sequence sources

In [13]:
# Map each genome's `gb_acc` to the builtin-types form of its
# 'sequence_source' metadata entry, with a progress bar sized by the number
# of accessions in the genomes table.
sequence_sources = dict(
    (g.gb_acc, g.meta['sequence_source'].as_builtin())
    for g in tqdm(session.query(Genome), total=len(accs))
)

100%|██████████| 50752/50752 [00:42<00:00, 1183.55it/s]


## Write output

In [14]:
# One record per accession, in genomes-table order: the accession itself
# followed by the fields of its sequence-source entry.
data_out = [
    {'accession': acc, **sequence_sources[acc]}
    for acc in accs
]

In [15]:
# Write the collected records as a single JSON array to the intermediate
# data directory.
with (data_intermediate / 'seq-urls.json').open('w') as f:
    json.dump(data_out, f)