In [1]:
import getpass
import numpy as np
import os
import pandas as pd
import polars as pl
import pymysql 
import re
import shutil
import subprocess
import time

from datetime import datetime
from db_queries import get_demographics, get_ids, get_location_metadata, get_population, get_sequela_metadata
from save_results import save_results_epi, save_results_cod

release = 16
level_3 = 'ints' # set to either 'ints' or 'intest'
clear_output_dir = False
best = True

bundles = {'intest': 556, 'ints':3785}
xwalks = {'intest': 17891, 'ints': 7298}
    
input_dir = f'/ihme/scratch/users/stanaway/{level_3}/release_{release}/inputs/'
output_dir = f'/ihme/scratch/users/stanaway/{level_3}/release_{release}/draws/'

# Make a list of all of the cause and modelable entity ids for which we'll produce output
seq_meta = get_sequela_metadata(sequela_set_id=2, release_id=release)

if level_3 == 'intest':
    cid_list = [319, 320]
    seq_meta = seq_meta.loc[seq_meta.cause_id.isin(cid_list), 'modelable_entity_id'].tolist()
    upload_ids = cid_list + seq_meta
    output_ids = upload_ids + [2523, 23991, 23992]
elif level_3 == 'ints':
    cid_list = [959]
    seq_meta = seq_meta.loc[seq_meta.cause_id.isin(cid_list), 'modelable_entity_id'].tolist()
    upload_ids = cid_list + seq_meta + [27540, 28000]
    intermediate_ids = [9999, 9959, 196800, 196801]
    output_ids = upload_ids + intermediate_ids
else:
    print('Value of level_3 must be either "intest" or "ints".')

for output_id in output_ids + ['model_info']:
    if clear_output_dir == True:
        shutil.rmtree(os.path.join(output_dir, str(output_id)))
    os.makedirs(os.path.join(output_dir, str(output_id)), exist_ok = True)
    
    
cod_demog = get_demographics(gbd_team="cod", release_id = release)
locs = cod_demog['location_id']

if level_3 == 'intest':
    loc_meta = get_location_metadata(35, release_id = release)
    ind_locs = loc_meta[loc_meta.path_to_top_parent.str.contains(',163,')]['location_id'].tolist()

            
user = getpass.getuser()

In [2]:
launch_time = datetime.now()

for loc in locs:
    if level_3 == 'intest' and loc in ind_locs:
        mem = '100G'
    else:
        mem = '20G'

    submission_list = ['sbatch', '-J', f'{level_3}_{loc}', '-e', f'/ihme/temp/slurmoutput/{user}/errors/%x.e%j.txt',
                       '-o', f'/ihme/temp/slurmoutput/{user}/output/%x.o%j.txt', '-A', 'proj_erf',
                       f'--mem={mem}', '-c', '4', '-t', '600', '-p', 'all.q', 
                       '/ihme/homes/stanaway/py_shell.sh ' +  '/ihme/homes/stanaway/enteric_split.py ' + str(loc) + ' ' + str(release) + ' ' + str(level_3)]
    
    submission_str = " ".join(submission_list)
    os.system(submission_str)
    
while True:
    time.sleep(30)
    
    all_jobs = subprocess.check_output('squeue --me -o "%j"', shell = True)
    all_jobs = all_jobs.decode().split("\n")[1:]

    matching_jobs = [job for job in all_jobs if re.match(f"^{level_3}", job)]
    if len(matching_jobs) == 0:
        print('All jobs done running. Checking output files.')
        break
    else:
        print('Jobs still running.  Will check again in a minute.')
        

Submitted batch job 65794280
Submitted batch job 65794281
Submitted batch job 65794282
Submitted batch job 65794283
Submitted batch job 65794284
Submitted batch job 65794285
Submitted batch job 65794286
Submitted batch job 65794287
Submitted batch job 65794288
Submitted batch job 65794289
Submitted batch job 65794290
Submitted batch job 65794291
Submitted batch job 65794292
Submitted batch job 65794293
Submitted batch job 65794294
Submitted batch job 65794295
Submitted batch job 65794296
Submitted batch job 65794297
Submitted batch job 65794298
Submitted batch job 65794299
Submitted batch job 65794300
Submitted batch job 65794301
Submitted batch job 65794302
Submitted batch job 65794303
Submitted batch job 65794304
Submitted batch job 65794305
Submitted batch job 65794306
Submitted batch job 65794307
Submitted batch job 65794308
Submitted batch job 65794309
Submitted batch job 65794310
Submitted batch job 65794311
Submitted batch job 65794312
Submitted batch job 65794313
Submitted batc

In [3]:
def file_checker(file, launch):
    exists = os.path.isfile(file)
    if exists: 
        mtime = datetime.fromtimestamp(os.path.getmtime(file))
        if mtime > launch_time:
            return 'new'
        else:
            return 'old'
    else:
        return 'missing'

    
checks = [[id, loc, file_checker(os.path.join(output_dir, str(id), f"{loc}.csv"), launch_time)] for id in output_ids for loc in locs]
checks = pd.DataFrame(checks, columns = ['meid', 'location_id', 'status'])
checks['complete'] = checks['status'] == 'new'

ready = checks.groupby(['meid'])['complete'].mean().reset_index()

meids_to_upload = ready.loc[ready.complete==1, 'meid']
meids_to_upload = [x for x in meids_to_upload if x in upload_ids]
print(checks.groupby('meid')['status'].value_counts())

meid   status
319    new       843
320    new       843
1249   new       843
1250   new       843
1251   new       843
1253   new       843
1254   new       843
1255   new       843
1256   new       843
2523   new       843
3134   new       843
23991  new       843
23992  new       843
Name: status, dtype: int64


In [4]:
print(meids_to_upload)
print(upload_ids)
missing_upload_ids = set(upload_ids) - set(meids_to_upload)
if len(missing_upload_ids) == 0:
    print("Results are complete for all upload IDs")
else:
    print("The following upload ids do not have complete results: " + str(missing_upload_ids))

[319, 320, 1249, 1250, 1251, 1253, 1254, 1255, 1256, 3134]
[319, 320, 3134, 1251, 1249, 1250, 1256, 1253, 1254, 1255]
Results are complete for all upload IDs


In [5]:
model_info = (pl.scan_csv(os.path.join(output_dir, 'model_info', '*.csv')).collect(streaming = True)).unique()
label = []
for tool, group in model_info.group_by('tool'):
    label.append(tool + ' = ' + ', '.join([str(id) for id in group['model_version_id']]))

label = '; and '.join(label)
description = f'Natural hx / CODEm hybrid using {label}, with python pipeline'
print(description)

Natural hx / CODEm hybrid using codem = 739996, 739994, 739997, 740256; and dismod = 799168, 799167, 799169, with python pipeline


In [6]:
for id in meids_to_upload:
    if id in cid_list:
        type = 'cod'
        mem = '200G'
        measures = 1

    else:
        type = 'epi'
        mem = '100G'

        m_test = pl.read_csv(os.path.join(output_dir, str(id), '161.csv'))
        measures = ' '.join(map(str, m_test['measure_id'].unique()))

    best = 'True'

    submission_list = ['sbatch', '-J', f'upload_{id}', '-e', f'/ihme/temp/slurmoutput/{user}/errors/%x.e%j.txt',
                       '-o', f'/ihme/temp/slurmoutput/{user}/output/%x.o%j.txt', '-A', 'proj_erf',
                       f'--mem={mem}', '-c', '4', '-t', '1000', '-p', 'all.q', 
                       '/ihme/homes/stanaway/py_shell.sh ' +  '/ihme/homes/stanaway/apTest.py ' + 
                       f'--type {type} --id {id} --path {os.path.join(output_dir, str(id))} --description "{description}" ' +
                       f'--measure {measures} --best {best} --release {release} --bundle {bundles[level_3]} --xwalk {xwalks[level_3]}']

    submission_str = " ".join(submission_list)
    print(submission_str)
    os.system(submission_str)

sbatch -J upload_1249 -e /ihme/temp/slurmoutput/stanaway/errors/%x.e%j.txt -o /ihme/temp/slurmoutput/stanaway/output/%x.o%j.txt -A proj_erf --mem=100G -c 4 -t 1000 -p all.q /ihme/homes/stanaway/py_shell.sh /ihme/homes/stanaway/apTest.py --type epi --id 1249 --path /ihme/scratch/users/stanaway/intest/release_16/draws/1249 --description "Natural hx / CODEm hybrid using codem = 739996, 739994, 739997, 740256; and dismod = 799168, 799167, 799169, with python pipeline" --measure 5 6 --best True --release 16 --bundle 556 --xwalk 17891
Submitted batch job 65795407
sbatch -J upload_1250 -e /ihme/temp/slurmoutput/stanaway/errors/%x.e%j.txt -o /ihme/temp/slurmoutput/stanaway/output/%x.o%j.txt -A proj_erf --mem=100G -c 4 -t 1000 -p all.q /ihme/homes/stanaway/py_shell.sh /ihme/homes/stanaway/apTest.py --type epi --id 1250 --path /ihme/scratch/users/stanaway/intest/release_16/draws/1250 --description "Natural hx / CODEm hybrid using codem = 739996, 739994, 739997, 740256; and dismod = 799168, 79916

In [7]:
def get_latest_dr_codem_models(cause_ids, release):
    cause_ids_str = ", ".join([str(x) for x in cause_ids])
    varlist = ['model_version_id', 'cause_id', 'sex_id', 'date_inserted']
    db = pymysql.connect(host = 'modeling-cod-db.ihme.washington.edu', user = 'dbview', password = 'E3QNSLvQTRJm', database = 'cod') 
    
    with db:
        with db.cursor() as cursor:
            cursor.execute(f'SELECT {", ".join(varlist)} FROM model_version \
            WHERE cause_id IN ({cause_ids_str}) AND model_version_type_id = 2 AND status IN (1,2) AND release_id = {release}') 
    
            # Fetch all rows of data, put them in a data frame and add column names
            cod_models = pd.DataFrame(cursor.fetchall(), columns = varlist)
    
    latest = cod_models.groupby(['cause_id', 'sex_id'])['model_version_id'].max()
    cod_models = cod_models[cod_models.model_version_id.isin(latest)]
    cod_models = cod_models.merge(get_ids('sex'), on = 'sex_id', how = 'left') \
                           .merge(get_ids('cause'), on = 'cause_id', how = 'left')
    cod_models.loc[cod_models.acause == 'intest_paratyph', 'acause'] = 'intest_paratyphoid'
    cod_models['dict_key'] = (cod_models['sex'].astype(str)).str.lower() + "_" + cod_models['acause'].str.replace('intest_', '')
    
    cids = cod_models[['dict_key','model_version_id']].set_index('dict_key')['model_version_id'].to_dict() 
    return cids

#codem_ids = [get_latest_dr_codem_models(c) for c in [319, 320, 959]]
#codem_ids = get_latest_dr_codem_models([319, 320, 959])
#print(codem_ids)